Search in sources :

Example 51 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class OldExcelParserTest method testPlainText.

/**
 * Parses the test file with {@link OldExcelParser} and verifies that the
 * extracted plain text contains a handful of expected words and numbers.
 */
@Test
public void testPlainText() throws Exception {
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata parsedMetadata = new Metadata();
    try (TikaInputStream in = getTestFile(file)) {
        new OldExcelParser().parse(in, bodyHandler, parsedMetadata, new ParseContext());
    }
    String extracted = bodyHandler.toString();
    // Two words followed by two numbers that are expected in the extracted text
    String[] expectedFragments = { "Size", "Returns", "11", "784" };
    for (String fragment : expectedFragments) {
        assertContains(fragment, extracted);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) TikaInputStream(org.apache.tika.io.TikaInputStream) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 52 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class AbstractPOIContainerExtractionTest method process.

/**
 * Opens the named test file, asserts that the extractor supports it, and runs
 * the extraction into a fresh {@link TrackingHandler}.
 *
 * @param filename  name of the test file to open via {@code getTestFile}
 * @param extractor extractor under test
 * @param recurse   when {@code true} the extractor itself is passed as the
 *                  recursion extractor; otherwise {@code null} is passed
 * @return the handler that was handed to the extractor, for later inspection
 */
protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
    try (TikaInputStream in = getTestFile(filename)) {
        assertEquals(true, extractor.isSupported(in));
        TrackingHandler tracker = new TrackingHandler();
        // Recursing means handing the same extractor back in for nested containers
        ContainerExtractor recurseWith = recurse ? extractor : null;
        extractor.extract(in, recurseWith, tracker);
        // Returned so callers can check what happened
        return tracker;
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream)

Example 53 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class FSBatchProcessCLI method execute.

/**
 * Parses the command line, builds a {@link BatchProcess} from the config
 * stream, runs it on a single-thread executor, prints the result and exits
 * the JVM with the exit status reported by that result.
 * <p>
 * If the {@code help} option is present, {@code usage()} is called and the
 * JVM exits with {@code BatchProcessDriverCLI.PROCESS_NO_RESTART_EXIT_CODE}.
 *
 * @param args raw command line arguments
 * @throws Exception if parsing the arguments, building the process or
 *                   waiting for its result fails
 */
private void execute(String[] args) throws Exception {
    CommandLineParser cliParser = new DefaultParser();
    CommandLine line = cliParser.parse(options, args);
    if (line.hasOption("help")) {
        usage();
        System.exit(BatchProcessDriverCLI.PROCESS_NO_RESTART_EXIT_CODE);
    }
    // Options supplied without a value are recorded with the value "true"
    Map<String, String> mapArgs = new HashMap<String, String>();
    for (Option option : line.getOptions()) {
        String v = option.getValue();
        if (v == null || v.equals("")) {
            v = "true";
        }
        mapArgs.put(option.getOpt(), v);
    }
    BatchProcessBuilder b = new BatchProcessBuilder();
    TikaInputStream is = null;
    BatchProcess process = null;
    try {
        is = getConfigInputStream(args, false);
        process = b.build(is, mapArgs);
    } finally {
        // The config stream is only needed while building the process
        IOUtils.closeQuietly(is);
    }
    ExecutorService executor = Executors.newSingleThreadExecutor();
    ParallelFileProcessingResult result;
    try {
        Future<ParallelFileProcessingResult> futureResult = executor.submit(process);
        result = futureResult.get();
    } finally {
        // Without this, a failure in get() would leave the executor's idle
        // worker thread alive after this method has thrown.
        executor.shutdown();
    }
    System.out.println(FINISHED_STRING);
    System.out.println("\n");
    System.out.println(result.toString());
    System.exit(result.getExitStatus());
}
Also used : HashMap(java.util.HashMap) BatchProcess(org.apache.tika.batch.BatchProcess) TikaInputStream(org.apache.tika.io.TikaInputStream) CommandLine(org.apache.commons.cli.CommandLine) ParallelFileProcessingResult(org.apache.tika.batch.ParallelFileProcessingResult) BatchProcessBuilder(org.apache.tika.batch.builders.BatchProcessBuilder) ExecutorService(java.util.concurrent.ExecutorService) Option(org.apache.commons.cli.Option) CommandLineParser(org.apache.commons.cli.CommandLineParser) DefaultParser(org.apache.commons.cli.DefaultParser)

Example 54 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class TikaGUI method handleStream.

/**
 * Parses the given stream once with the configured parser, fills the HTML,
 * text, main-text, XML and metadata views from that parse, and then, if the
 * stream can be reset, parses it a second time with a
 * {@link RecursiveParserWrapper} to fill the JSON view.
 *
 * @param input stream to parse; it is wrapped in a progress-monitoring
 *              {@link TikaInputStream} before use
 * @param md    metadata object populated by the first parse and used for the
 *              metadata view and the window title
 * @throws Exception if either parse fails
 */
private void handleStream(InputStream input, Metadata md) throws Exception {
    StringWriter htmlBuffer = new StringWriter();
    StringWriter textBuffer = new StringWriter();
    StringWriter textMainBuffer = new StringWriter();
    StringWriter xmlBuffer = new StringWriter();
    StringBuilder metadataBuffer = new StringBuilder();
    ContentHandler handler = new TeeContentHandler(
            getHtmlHandler(htmlBuffer),
            getTextContentHandler(textBuffer),
            getTextMainContentHandler(textMainBuffer),
            getXmlContentHandler(xmlBuffer));
    context.set(DocumentSelector.class, new ImageDocumentSelector());
    input = TikaInputStream.get(new ProgressMonitorInputStream(this, "Parsing stream", input));
    if (input.markSupported()) {
        int mark = -1;
        if (input instanceof TikaInputStream) {
            if (((TikaInputStream) input).hasFile()) {
                long length = ((TikaInputStream) input).getLength();
                if (length >= 0) {
                    // Clamp rather than cast: a plain (int) cast wraps around for
                    // lengths above Integer.MAX_VALUE and yields a bogus read limit.
                    mark = (int) Math.min(length, (long) Integer.MAX_VALUE);
                }
            }
        }
        if (mark == -1) {
            mark = MAX_MARK;
        }
        // Mark the start so the stream can be rewound for the recursive parse below
        input.mark(mark);
    }
    parser.parse(input, handler, md, context);
    // Render the metadata as sorted "name: value" lines
    String[] names = md.names();
    Arrays.sort(names);
    for (String name : names) {
        for (String val : md.getValues(name)) {
            metadataBuffer.append(name);
            metadataBuffer.append(": ");
            metadataBuffer.append(val);
            metadataBuffer.append("\n");
        }
    }
    String name = md.get(Metadata.RESOURCE_NAME_KEY);
    if (name != null && name.length() > 0) {
        setTitle("Apache Tika: " + name);
    } else {
        setTitle("Apache Tika: unnamed document");
    }
    setText(metadata, metadataBuffer.toString());
    setText(xml, xmlBuffer.toString());
    setText(text, textBuffer.toString());
    setText(textMain, textMainBuffer.toString());
    setText(html, htmlBuffer.toString());
    if (!input.markSupported()) {
        // No way to rewind, so the recursive (JSON) view cannot be produced
        setText(json, "InputStream does not support mark/reset for Recursive Parsing");
        layout.show(cards, "metadata");
        return;
    }
    boolean isReset = false;
    try {
        input.reset();
        isReset = true;
    } catch (IOException e) {
        setText(json, "Error during stream reset.\n" + "There's a limit of " + MAX_MARK + " bytes for this type of processing in the GUI.\n" + "Try the app with command line argument of -J.");
    }
    if (isReset) {
        // Second pass over the rewound stream; uses a fresh Metadata and ParseContext
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
        wrapper.parse(input, null, new Metadata(), new ParseContext());
        StringWriter jsonBuffer = new StringWriter();
        JsonMetadataList.setPrettyPrinting(true);
        JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer);
        setText(json, jsonBuffer.toString());
    }
    layout.show(cards, "metadata");
}
Also used : ProgressMonitorInputStream(javax.swing.ProgressMonitorInputStream) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) StringWriter(java.io.StringWriter) ParseContext(org.apache.tika.parser.ParseContext)

Example 55 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class OfficeParser method parse.

/**
 * Extracts properties and text from an MS Document input stream.
 * <p>
 * The root directory is taken from the stream's open container when one is
 * already set; otherwise a new {@link NPOIFSFileSystem} is opened. Only a
 * filesystem opened for a non-{@link TikaInputStream} input is closed here.
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    configure(context);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    TikaInputStream tis = TikaInputStream.cast(stream);
    // Stays null unless this method opens a filesystem on a plain InputStream
    NPOIFSFileSystem ownedFs = null;
    try {
        final DirectoryNode root;
        if (tis != null) {
            final Object opened = tis.getOpenContainer();
            if (opened instanceof NPOIFSFileSystem) {
                root = ((NPOIFSFileSystem) opened).getRoot();
            } else if (opened instanceof DirectoryNode) {
                root = (DirectoryNode) opened;
            } else {
                NPOIFSFileSystem attachedFs = tis.hasFile()
                        ? new NPOIFSFileSystem(tis.getFile(), true)
                        : new NPOIFSFileSystem(new CloseShieldInputStream(tis));
                // the TikaInputStream will close this filesystem, so it is not tracked for closing here
                tis.setOpenContainer(attachedFs);
                root = attachedFs.getRoot();
            }
        } else {
            ownedFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
            root = ownedFs.getRoot();
        }
        parse(root, context, metadata, xhtml);
        OfficeParserConfig config = context.get(OfficeParserConfig.class);
        if (config.getExtractMacros()) {
            // macros are extracted after the main document content
            extractMacros(root.getNFileSystem(), xhtml, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
        }
    } finally {
        IOUtils.closeQuietly(ownedFs);
    }
    xhtml.endDocument();
}
Also used : NPOIFSFileSystem(org.apache.poi.poifs.filesystem.NPOIFSFileSystem) TikaInputStream(org.apache.tika.io.TikaInputStream) DirectoryNode(org.apache.poi.poifs.filesystem.DirectoryNode) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Aggregations

TikaInputStream (org.apache.tika.io.TikaInputStream) 100, Metadata (org.apache.tika.metadata.Metadata) 40, TemporaryResources (org.apache.tika.io.TemporaryResources) 28, IOException (java.io.IOException) 27, TikaException (org.apache.tika.exception.TikaException) 24, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler) 23, Test (org.junit.Test) 20, InputStream (java.io.InputStream) 19, File (java.io.File) 15, BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 15, ContentHandler (org.xml.sax.ContentHandler) 14, TikaTest (org.apache.tika.TikaTest) 13, MediaType (org.apache.tika.mime.MediaType) 13, SAXException (org.xml.sax.SAXException) 13, ParseContext (org.apache.tika.parser.ParseContext) 12, ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor) 8, CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream) 6, NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem) 6, EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException) 6, AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 6