Search in sources :

Example 16 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class JackcessParserTest method testBasic.

/**
 * Parses each sample Access database through a {@link RecursiveParserWrapper} whose content
 * handlers emit XML, then checks the extracted content: table markup, a date value, stripped
 * markup, unicode text and the content of an embedded document.
 *
 * @throws Exception if a test resource cannot be read or parsed
 */
@Test
public void testBasic() throws Exception {
    Parser p = new AutoDetectParser();
    RecursiveParserWrapper w = new RecursiveParserWrapper(p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
    for (String fName : new String[] { "testAccess2.accdb", "testAccess2_2000.mdb", "testAccess2_2002-2003.mdb" }) {
        // try-with-resources closes the stream even if parse() throws; unlike the former
        // closeQuietly() call, a failure while closing is reported rather than swallowed
        try (InputStream is = this.getResourceAsStream("/test-documents/" + fName)) {
            Metadata meta = new Metadata();
            ParseContext c = new ParseContext();
            w.parse(is, new DefaultHandler(), meta, c);
        }
        List<Metadata> list = w.getMetadata();
        assertEquals(4, list.size());
        // the first entry carries the content of the database file itself
        String mainContent = list.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
        //make sure there's a thead and tbody
        assertContains("</thead><tbody>", mainContent);
        //assert table header
        assertContains("<th>ShortTextField</th>", mainContent);
        //test date format
        assertContains("6/24/15", mainContent);
        //test that markup is stripped
        assertContains("over the bold italic dog", mainContent);
        //test unicode
        assertContains("普林斯顿大学", mainContent);
        //test embedded document handling
        assertContains("Test Document with embedded pdf", list.get(3).get(RecursiveParserWrapper.TIKA_CONTENT));
        // clear the collected metadata before the wrapper is reused for the next file
        w.reset();
    }
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 17 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class BasicTikaFSConsumersBuilder method getOutputStreamFactory.

/**
 * Builds the {@link FSOutputStreamFactory} described by the attributes of {@code node}
 * merged with {@code runtimeAttributes}.
 *
 * <p>Attributes read: {@code outputDir}, {@code compression} and {@code outputSuffix}.
 * A {@code handleExisting} attribute is not consulted here (the code that parsed it is
 * disabled); {@code FSUtil.HANDLE_EXISTING.SKIP} is always used.
 *
 * @param node configuration node whose attributes are read
 * @param runtimeAttributes attributes passed to {@code XMLDOMUtil.mapifyAttrs} together with the node
 * @param contentHandlerFactory used to derive a default suffix when it is a
 *        {@link BasicContentHandlerFactory} and the recursive wrapper is not in use
 * @param useRecursiveParserWrapper when {@code true} the default suffix starts with "json"
 * @return the configured output stream factory
 */
private OutputStreamFactory getOutputStreamFactory(Node node, Map<String, String> runtimeAttributes, ContentHandlerFactory contentHandlerFactory, boolean useRecursiveParserWrapper) {
    Map<String, String> attributes = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
    Path targetDir = PropsUtil.getPath(attributes.get("outputDir"), null);

    // No "compression" attribute, or an unrecognised value, leaves the default of NONE.
    // The checks run in this order on purpose: "bz" first, then "gz", then "zip".
    FSOutputStreamFactory.COMPRESSION codec = FSOutputStreamFactory.COMPRESSION.NONE;
    String requestedCompression = attributes.get("compression");
    if (requestedCompression != null) {
        if (requestedCompression.contains("bz")) {
            codec = FSOutputStreamFactory.COMPRESSION.BZIP2;
        } else if (requestedCompression.contains("gz")) {
            codec = FSOutputStreamFactory.COMPRESSION.GZIP;
        } else if (requestedCompression.contains("zip")) {
            codec = FSOutputStreamFactory.COMPRESSION.ZIP;
        }
    }

    //suffix should not start with "."
    String outputSuffix = attributes.get("outputSuffix");
    if (outputSuffix == null) {
        // no explicit suffix: derive one from the output type plus the compression
        StringBuilder derived = new StringBuilder();
        if (useRecursiveParserWrapper) {
            derived.append("json");
        } else if (contentHandlerFactory instanceof BasicContentHandlerFactory) {
            BasicContentHandlerFactory basicFactory = (BasicContentHandlerFactory) contentHandlerFactory;
            appendSuffix(basicFactory.getType(), derived);
        }
        appendCompression(codec, derived);
        outputSuffix = derived.toString();
    }

    //if the driver restarts and this is set to overwrite...
    return new FSOutputStreamFactory(targetDir, FSUtil.HANDLE_EXISTING.SKIP, codec, outputSuffix);
}
Also used : Path(java.nio.file.Path) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) FSOutputStreamFactory(org.apache.tika.batch.fs.FSOutputStreamFactory)

Example 18 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class TikaGUI method handleStream.

/**
 * Parses {@code input} and fills the views of the GUI.
 *
 * <p>The stream is parsed once with a tee of four handlers (HTML, text, main text, XML),
 * then the metadata is listed sorted by name and the window title is set from the resource
 * name. If the stream supports mark/reset it is rewound and parsed a second time through a
 * {@link RecursiveParserWrapper} to produce the JSON view; otherwise, or when the reset
 * fails, an explanatory message is shown in the JSON view instead.
 *
 * @param input stream to parse; it is wrapped in a progress monitor and a {@link TikaInputStream}
 * @param md metadata object passed to the first parse and rendered into the metadata view
 * @throws Exception if either parse fails or the stream length cannot be determined
 */
private void handleStream(InputStream input, Metadata md) throws Exception {
    StringWriter htmlBuffer = new StringWriter();
    StringWriter textBuffer = new StringWriter();
    StringWriter textMainBuffer = new StringWriter();
    StringWriter xmlBuffer = new StringWriter();
    StringBuilder metadataBuffer = new StringBuilder();
    ContentHandler handler = new TeeContentHandler(getHtmlHandler(htmlBuffer), getTextContentHandler(textBuffer), getTextMainContentHandler(textMainBuffer), getXmlContentHandler(xmlBuffer));
    context.set(DocumentSelector.class, new ImageDocumentSelector());
    input = TikaInputStream.get(new ProgressMonitorInputStream(this, "Parsing stream", input));
    if (input.markSupported()) {
        // Mark the start so the stream can be rewound for the recursive parse below.
        // Default read limit is MAX_MARK; a file-backed stream uses its own length instead.
        int mark = MAX_MARK;
        if (input instanceof TikaInputStream && ((TikaInputStream) input).hasFile()) {
            long length = ((TikaInputStream) input).getLength();
            if (length >= 0) {
                // clamp rather than cast blindly: a length above Integer.MAX_VALUE would
                // otherwise wrap around to a meaningless (possibly negative) read limit
                mark = (int) Math.min(length, (long) Integer.MAX_VALUE);
            }
        }
        input.mark(mark);
    }
    parser.parse(input, handler, md, context);

    // render every metadata value as "name: value", ordered by name
    String[] names = md.names();
    Arrays.sort(names);
    for (String name : names) {
        for (String val : md.getValues(name)) {
            metadataBuffer.append(name);
            metadataBuffer.append(": ");
            metadataBuffer.append(val);
            metadataBuffer.append("\n");
        }
    }
    String name = md.get(Metadata.RESOURCE_NAME_KEY);
    if (name != null && name.length() > 0) {
        setTitle("Apache Tika: " + name);
    } else {
        setTitle("Apache Tika: unnamed document");
    }
    setText(metadata, metadataBuffer.toString());
    setText(xml, xmlBuffer.toString());
    setText(text, textBuffer.toString());
    setText(textMain, textMainBuffer.toString());
    setText(html, htmlBuffer.toString());

    // the JSON view needs a second pass over the same bytes, which requires mark/reset
    if (!input.markSupported()) {
        setText(json, "InputStream does not support mark/reset for Recursive Parsing");
        layout.show(cards, "metadata");
        return;
    }
    boolean isReset = false;
    try {
        input.reset();
        isReset = true;
    } catch (IOException e) {
        // reset failed: report it in the JSON view rather than failing the whole method
        setText(json, "Error during stream reset.\n" + "There's a limit of " + MAX_MARK + " bytes for this type of processing in the GUI.\n" + "Try the app with command line argument of -J.");
    }
    if (isReset) {
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
        // fresh Metadata/ParseContext so the second pass does not touch md or context
        wrapper.parse(input, null, new Metadata(), new ParseContext());
        StringWriter jsonBuffer = new StringWriter();
        JsonMetadataList.setPrettyPrinting(true);
        JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer);
        setText(json, jsonBuffer.toString());
    }
    layout.show(cards, "metadata");
}
Also used : ProgressMonitorInputStream(javax.swing.ProgressMonitorInputStream) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) StringWriter(java.io.StringWriter) ParseContext(org.apache.tika.parser.ParseContext) TeeContentHandler(org.apache.tika.sax.TeeContentHandler)

Example 19 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class RTFParserTest method testRegularImages.

//TIKA-1010 test regular (not "embedded") images/picts
/**
 * Parses testRTFRegularImages.rtf recursively and checks the metadata collected for two of
 * its images: subject and comment values, the thumbnail flag, and the number of metadata
 * names, including that values from one image do not leak into the other.
 *
 * @throws Exception if the test document cannot be read or parsed
 */
@Test
public void testRegularImages() throws Exception {
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(new AutoDetectParser(), new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
    ParseContext parseContext = new ParseContext();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata containerMetadata = new Metadata();
    containerMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");
    try (TikaInputStream rtfStream = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
        recursiveParser.parse(rtfStream, bodyHandler, containerMetadata, parseContext);
    }

    List<Metadata> collected = recursiveParser.getMetadata();
    // index 1: noted in the original as ("testJPEG_EXIF_普林斯顿.jpg")
    Metadata exifJpeg = collected.get(1);
    // index 3: noted in the original as ("testJPEG_普林斯顿.jpg")
    Metadata plainJpeg = collected.get(3);
    assertTrue(exifJpeg != null);
    assertTrue(plainJpeg != null);

    List<String> exifSubjects = Arrays.asList(exifJpeg.getValues("dc:subject"));
    assertTrue(exifSubjects.contains("serbor"));
    String plainComments = plainJpeg.get("Comments");
    assertTrue(plainComments.contains("Licensed to the Apache"));

    //make sure old metadata doesn't linger between objects
    List<String> plainSubjects = Arrays.asList(plainJpeg.getValues("dc:subject"));
    assertFalse(plainSubjects.contains("serbor"));

    assertEquals("false", plainJpeg.get(RTFMetadata.THUMBNAIL));
    assertEquals("false", exifJpeg.get(RTFMetadata.THUMBNAIL));

    int plainNameCount = plainJpeg.names().length;
    assertEquals(49, plainNameCount);
    int exifNameCount = exifJpeg.names().length;
    assertEquals(113, exifNameCount);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) RTFMetadata(org.apache.tika.metadata.RTFMetadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TikaInputStream(org.apache.tika.io.TikaInputStream) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) WriteOutContentHandler(org.apache.tika.sax.WriteOutContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 20 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class RecursiveMetadataResource method parseMetadata.

/**
 * Parses {@code is} recursively and returns the metadata collected by the
 * {@link RecursiveParserWrapper}.
 *
 * @param is stream to parse
 * @param httpHeaders request headers, handed to {@code TikaResource.fillMetadata} and
 *        {@code TikaResource.fillParseContext}
 * @param info request URI information; its path is passed to {@code TikaResource.parse}
 * @param handlerTypeName name of the content handler type; resolved with
 *        {@code DEFAULT_HANDLER_TYPE} supplied as the second argument
 * @return the wrapper's metadata wrapped in a {@link MetadataList}
 * @throws Exception if preparing the request or parsing fails
 */
private MetadataList parseMetadata(InputStream is, MultivaluedMap<String, String> httpHeaders, UriInfo info, String handlerTypeName) throws Exception {
    final Metadata rootMetadata = new Metadata();
    final ParseContext parseContext = new ParseContext();
    Parser baseParser = TikaResource.createParser();

    // TODO: parameterize choice of max chars/max embedded attachments
    BasicContentHandlerFactory.HANDLER_TYPE handlerType = BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
    BasicContentHandlerFactory handlerFactory = new BasicContentHandlerFactory(handlerType, -1);
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(baseParser, handlerFactory);

    TikaResource.fillMetadata(baseParser, rootMetadata, parseContext, httpHeaders);
    // no need to add parser to parse recursively
    TikaResource.fillParseContext(parseContext, httpHeaders, null);
    TikaResource.logRequest(LOG, info, rootMetadata);

    String requestPath = info.getPath();
    LanguageHandler languageHandler = new LanguageHandler() {

        // stores the handler's detected language under the "language" metadata key
        public void endDocument() {
            rootMetadata.set("language", getLanguage().getLanguage());
        }
    };
    TikaResource.parse(recursiveParser, LOG, requestPath, is, languageHandler, rootMetadata, parseContext);
    return new MetadataList(recursiveParser.getMetadata());
}
Also used : MetadataList(org.apache.tika.server.MetadataList) LanguageHandler(org.apache.tika.language.detect.LanguageHandler) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) Parser(org.apache.tika.parser.Parser)

Aggregations

BasicContentHandlerFactory (org.apache.tika.sax.BasicContentHandlerFactory): 22, Metadata (org.apache.tika.metadata.Metadata): 21, Test (org.junit.Test): 16, InputStream (java.io.InputStream): 10, TikaInputStream (org.apache.tika.io.TikaInputStream): 9, RecursiveParserWrapper (org.apache.tika.parser.RecursiveParserWrapper): 9, ParseContext (org.apache.tika.parser.ParseContext): 8, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 7, Parser (org.apache.tika.parser.Parser): 7, DefaultHandler (org.xml.sax.helpers.DefaultHandler): 7, TikaTest (org.apache.tika.TikaTest): 6, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 4, ByteArrayInputStream (java.io.ByteArrayInputStream): 3, IOException (java.io.IOException): 3, InputStreamReader (java.io.InputStreamReader): 2, ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue): 2, RecursiveParserWrapperFSConsumer (org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer): 2, TikaConfig (org.apache.tika.config.TikaConfig): 2, EmptyParser (org.apache.tika.parser.EmptyParser): 2, ContentHandler (org.xml.sax.ContentHandler): 2