Search in sources:

Example 41 with DefaultHandler

Use of org.xml.sax.helpers.DefaultHandler in the Apache Tika project.

From the class ForkParserTest, method testPoolSizeReached.

/**
 * Starts {@code parser.getPoolSize()} background parses that each read from a
 * pipe which is kept open, then starts one additional parse and checks that
 * its handler is still empty one second later. Once the pipes are closed and
 * all threads have been joined, the additional parse's handler is expected
 * to contain "Hello, World!".
 *
 * NOTE(review): presumably the additional parse is held back because every
 * pool slot is occupied (as the test name suggests) - confirm against
 * ForkParser.
 */
@Test
public void testPoolSizeReached() throws Exception {
    final ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser());
    try {
        // Starts with zero permits; permits are released by the worker
        // threads and acquired by this (the test) thread.
        final Semaphore barrier = new Semaphore(0);
        Thread[] threads = new Thread[parser.getPoolSize()];
        PipedOutputStream[] pipes = new PipedOutputStream[threads.length];
        // A single ParseContext instance is passed to every parse call below.
        final ParseContext context = new ParseContext();
        for (int i = 0; i < threads.length; i++) {
            // Input whose single-byte read() releases one permit before
            // delegating to the regular piped read.
            // NOTE(review): a permit is released on every read() call, so the
            // permit count is not necessarily one per thread - confirm this
            // is acceptable for the acquire below.
            final PipedInputStream input = new PipedInputStream() {

                @Override
                public synchronized int read() throws IOException {
                    barrier.release();
                    return super.read();
                }
            };
            // The write side is kept so the pipe can be closed later on.
            pipes[i] = new PipedOutputStream(input);
            threads[i] = new Thread() {

                public void run() {
                    try {
                        // The parsed content of the background parses is discarded.
                        ContentHandler o = new DefaultHandler();
                        parser.parse(input, o, new Metadata(), context);
                    } catch (Exception e) {
                        // Only printed; a failure here does not fail the test directly.
                        e.printStackTrace();
                    }
                }
            };
            threads[i].start();
        }
        // Wait until all the background parsers have been started
        barrier.acquire(parser.getPoolSize());
        // Handler of the additional parse; its text is what the assertions inspect.
        final ContentHandler o = new BodyContentHandler();
        Thread blocked = new Thread() {

            public void run() {
                try {
                    // Signal that this thread is running before it calls parse.
                    barrier.release();
                    InputStream stream = new ByteArrayInputStream(new byte[0]);
                    parser.parse(stream, o, new Metadata(), context);
                } catch (Exception e) {
                    // Only printed; a failure here does not fail the test directly.
                    e.printStackTrace();
                }
            }
        };
        blocked.start();
        // Wait until the last thread is started, and then some to
        // make sure that it would have had a chance to start processing
        // data had it not been blocked.
        barrier.acquire();
        Thread.sleep(1000);
        // The additional parse must not have produced any text yet.
        assertEquals("", o.toString());
        // Closing the write side of each pipe ends the corresponding
        // background input; then wait for that thread to finish.
        for (int i = 0; i < threads.length; i++) {
            pipes[i].close();
            threads[i].join();
        }
        blocked.join();
        // NOTE(review): "Hello, World!" is presumably what ForkTestParser emits - confirm.
        assertEquals("Hello, World!", o.toString().trim());
    } finally {
        parser.close();
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) PipedInputStream(java.io.PipedInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) PipedOutputStream(java.io.PipedOutputStream) Semaphore(java.util.concurrent.Semaphore) PipedInputStream(java.io.PipedInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) IOException(java.io.IOException) TikaException(org.apache.tika.exception.TikaException) DefaultHandler(org.xml.sax.helpers.DefaultHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test)

Example 42 with DefaultHandler

Use of org.xml.sax.helpers.DefaultHandler in the Apache Tika project.

From the class RecursiveParserWrapperFSConsumer, method processFileResource.

/**
 * Parses one file resource with a {@link RecursiveParserWrapper} and writes
 * the collected metadata list as JSON to the resource's output stream.
 *
 * <p>If the parse throws, the filtered stack trace is added to the first
 * metadata item (or to the container metadata when the list is empty) and
 * the list is still written out.
 *
 * @param fileResource the resource to parse
 * @return {@code false} if the output or input stream could not be opened,
 *         or if the parse threw something other than an {@link Error};
 *         {@code true} otherwise
 * @throws RuntimeException if writing the JSON output fails
 * @throws Error if the parse threw an {@code Error}; it is rethrown after
 *         the JSON output has been written
 */
@Override
public boolean processFileResource(FileResource fileResource) {
    Parser wrapped = parserFactory.getParser(tikaConfig);
    RecursiveParserWrapper parser = new RecursiveParserWrapper(wrapped, contentHandlerFactory);
    ParseContext context = new ParseContext();
    //the recursive wrapper is always registered as the context's parser
    context.set(Parser.class, parser);
    //try to open outputstream first
    OutputStream os = getOutputStream(fsOSFactory, fileResource);
    if (os == null) {
        LOG.debug("Skipping: {}", fileResource.getMetadata().get(FSProperties.FS_REL_PATH));
        return false;
    }
    //try to open the inputstream before the parse.
    //if the parse hangs or throws a nasty exception, at least there will
    //be a zero byte file there so that the batchrunner can skip that problematic
    //file during the next run.
    InputStream is = getInputStream(fileResource);
    if (is == null) {
        IOUtils.closeQuietly(os);
        return false;
    }
    Throwable thrown = null;
    List<Metadata> metadataList = null;
    Metadata containerMetadata = fileResource.getMetadata();
    try {
        parse(fileResource.getResourceId(), parser, is, new DefaultHandler(), containerMetadata, context);
        metadataList = parser.getMetadata();
    } catch (Throwable t) {
        //remember the failure; it is acted upon only after the output is written
        thrown = t;
        metadataList = parser.getMetadata();
        if (metadataList == null) {
            metadataList = new LinkedList<>();
        }
        Metadata m = null;
        if (metadataList.isEmpty()) {
            m = containerMetadata;
        } else {
            //take the top metadata item
            m = metadataList.remove(0);
        }
        String stackTrace = ExceptionUtils.getFilteredStackTrace(t);
        m.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime", stackTrace);
        //put the annotated item back at the head of the list
        metadataList.add(0, m);
    } finally {
        IOUtils.closeQuietly(is);
    }
    Writer writer = null;
    try {
        writer = new OutputStreamWriter(os, getOutputEncoding());
        JsonMetadataList.toJson(metadataList, writer);
    } catch (Exception e) {
        //this is a stop the world kind of thing
        LOG.error("{}", getXMLifiedLogMsg(IO_OS + "json", fileResource.getResourceId(), e));
        throw new RuntimeException(e);
    } finally {
        if (writer == null) {
            //the writer was never created, so nothing else owns the
            //already-opened output stream: close it here to avoid a leak
            IOUtils.closeQuietly(os);
        } else {
            flushAndClose(writer);
        }
    }
    if (thrown != null) {
        if (thrown instanceof Error) {
            throw (Error) thrown;
        } else {
            return false;
        }
    }
    return true;
}
Also used : InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) Metadata(org.apache.tika.metadata.Metadata) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) LinkedList(java.util.LinkedList) Parser(org.apache.tika.parser.Parser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) ParseContext(org.apache.tika.parser.ParseContext) OutputStreamWriter(java.io.OutputStreamWriter) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter)

Example 43 with DefaultHandler

Use of org.xml.sax.helpers.DefaultHandler in the Apache Tika project.

From the class TesseractOCRParser, method parse.

/**
 * Extracts the image's regular metadata with the fallback image parser and
 * then runs the OCR parse, writing its result to {@code handler} as XHTML.
 * Returns without doing anything when {@code hasTesseract(config)} is false.
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
    final TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
    // Bail out when Tesseract is not available; this can happen if someone
    // calls this parser directly rather than via DefaultParser or similar.
    if (!hasTesseract(config)) {
        return;
    }
    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream tis = TikaInputStream.get(stream, resources);
        // Force spooling to a temporary file unless the stream was already
        // a TikaInputStream backed by a file.
        tis.getPath();
        // Output file name handed to the tesseract command line; the real
        // output file gets a suffix appended to this name.
        final File ocrOutputBase = resources.createTemporaryFile();
        // Temporary workaround for TIKA-1445: until composite parsers with
        // strategies (e.g. Composite, Try In Turn) can be specified, always
        // pass the image on to the regular parser so that its metadata is
        // extracted as well.
        _TMP_IMAGE_METADATA_PARSER.parse(tis, new DefaultHandler(), metadata, parseContext);
        final XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
        out.startDocument();
        parse(tis, ocrOutputBase, parseContext, out, config);
        out.endDocument();
    } finally {
        // Always release the temporary resources, including on failure.
        resources.dispose();
    }
}
Also used : TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) File(java.io.File) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 44 with DefaultHandler

Use of org.xml.sax.helpers.DefaultHandler in the Apache Tika project.

From the class EpubParser, method parse.

/**
 * Walks the entries of the EPub (zip) stream: the "mimetype" entry sets the
 * content type, "metadata.xml" and "*.opf" entries go to the metadata
 * parser, and "*.html" / "*.xhtml" entries go to the content parser. All
 * content is emitted inside a single XHTML document on {@code handler}.
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // An EPub file is often made up of several XHTML files, so the start
    // and end of the output document are controlled explicitly here.
    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    final ContentHandler childHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));
    final ZipInputStream zip = new ZipInputStream(stream);
    for (ZipEntry entry = zip.getNextEntry(); entry != null; entry = zip.getNextEntry()) {
        final String name = entry.getName();
        if (name.equals("mimetype")) {
            String type = IOUtils.toString(zip, UTF_8);
            // the value often carries trailing new lines
            if (type != null) {
                type = type.trim();
            }
            metadata.set(Metadata.CONTENT_TYPE, type);
        } else if (name.equals("metadata.xml") || name.endsWith(".opf")) {
            // metadata only; any SAX events are discarded
            meta.parse(zip, new DefaultHandler(), metadata, context);
        } else if (name.endsWith(".html") || name.endsWith(".xhtml")) {
            content.parse(zip, childHandler, metadata, context);
        }
    }
    // Close the document that wraps all entries
    xhtml.endDocument();
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ZipInputStream(java.util.zip.ZipInputStream) ZipEntry(java.util.zip.ZipEntry) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) ContentHandler(org.xml.sax.ContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 45 with DefaultHandler

Use of org.xml.sax.helpers.DefaultHandler in the Apache Tika project.

From the class TesseractOCRParserTest, method runOCR.

/**
 * Parses the given classpath resource through a RecursiveParserWrapper with
 * the requested OCR output type and inline-image extraction enabled for
 * PDFs, then checks the collected metadata.
 *
 * @param resource       classpath resource to parse
 * @param nonOCRContains strings that must occur in the concatenated content
 * @param numMetadatas   expected number of metadata items
 * @param handlerType    handler type for the content handler factory
 * @param outputType     output type set on the Tesseract config
 * @return the concatenated TIKA_CONTENT values of all metadata items
 */
private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, BasicContentHandlerFactory.HANDLER_TYPE handlerType, TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception {
    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
    ocrConfig.setOutputType(outputType);
    // Keep the concrete wrapper type so the metadata can be read without a cast.
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser(), new BasicContentHandlerFactory(handlerType, -1));
    Parser parser = wrapper;
    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);
    ParseContext ctx = new ParseContext();
    ctx.set(TesseractOCRConfig.class, ocrConfig);
    ctx.set(Parser.class, parser);
    ctx.set(PDFParserConfig.class, pdfConfig);
    try (InputStream in = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
        parser.parse(in, new DefaultHandler(), new Metadata(), ctx);
    }
    List<Metadata> collected = wrapper.getMetadata();
    assertEquals(numMetadatas, collected.size());
    // Concatenate the extracted content of every metadata item.
    StringBuilder buffer = new StringBuilder();
    for (Metadata item : collected) {
        buffer.append(item.get(RecursiveParserWrapper.TIKA_CONTENT));
    }
    String allContent = buffer.toString();
    for (String expected : nonOCRContains) {
        assertContains(expected, allContent);
    }
    // The first two metadata items must each carry more than ten names.
    assertTrue(collected.get(0).names().length > 10);
    assertTrue(collected.get(1).names().length > 10);
    //test at least one value
    assertEquals("deflate", collected.get(1).get("Compression CompressionTypeName"));
    return allContent;
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) ExternalParser(org.apache.tika.parser.external.ExternalParser) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ImageParser(org.apache.tika.parser.image.ImageParser) DefaultParser(org.apache.tika.parser.DefaultParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) PDFParserConfig(org.apache.tika.parser.pdf.PDFParserConfig)

Aggregations

DefaultHandler (org.xml.sax.helpers.DefaultHandler): 148; InputStream (java.io.InputStream): 65; Metadata (org.apache.tika.metadata.Metadata): 59; ParseContext (org.apache.tika.parser.ParseContext): 52; Test (org.junit.Test): 44; Attributes (org.xml.sax.Attributes): 41; SAXParser (javax.xml.parsers.SAXParser): 40; SAXException (org.xml.sax.SAXException): 39; ByteArrayInputStream (java.io.ByteArrayInputStream): 32; SAXParserFactory (javax.xml.parsers.SAXParserFactory): 29; IOException (java.io.IOException): 26; InputSource (org.xml.sax.InputSource): 23; ParserConfigurationException (javax.xml.parsers.ParserConfigurationException): 22; Parser (org.apache.tika.parser.Parser): 22; TikaInputStream (org.apache.tika.io.TikaInputStream): 20; ContentHandler (org.xml.sax.ContentHandler): 20; File (java.io.File): 19; AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 17; BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 16; FileInputStream (java.io.FileInputStream): 15