Search in sources :

Example 66 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class ForkParserIntegrationTest method testAttachingADebuggerOnTheForkedParserShouldWork.

/**
     * TIKA-832
     */
@Test
public void testAttachingADebuggerOnTheForkedParserShouldWork() throws Exception {
    ParseContext context = new ParseContext();
    context.set(Parser.class, tika.getParser());
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
    parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug", "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));
    try {
        ContentHandler body = new BodyContentHandler();
        InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt");
        parser.parse(stream, body, new Metadata(), context);
        String content = body.toString();
        assertContains("Test d'indexation", content);
        assertContains("http://www.apache.org", content);
    } finally {
        parser.close();
    }
}
Also used : ForkParser(org.apache.tika.fork.ForkParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Example 67 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class TikaResource method createParser.

@SuppressWarnings("serial")
public static Parser createParser() {
    final Parser parser = new AutoDetectParser(tikaConfig);
    Map<MediaType, Parser> parsers = ((AutoDetectParser) parser).getParsers();
    parsers.put(MediaType.APPLICATION_XML, new HtmlParser());
    ((AutoDetectParser) parser).setParsers(parsers);
    ((AutoDetectParser) parser).setFallback(new Parser() {

        public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
            return parser.getSupportedTypes(parseContext);
        }

        public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) {
            throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
        }
    });
    if (digester != null) {
        return new DigestingParser(parser, digester);
    }
    return parser;
}
Also used : HtmlParser(org.apache.tika.parser.html.HtmlParser) Set(java.util.Set) WebApplicationException(javax.ws.rs.WebApplicationException) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) DigestingParser(org.apache.tika.parser.DigestingParser) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ExpandedTitleContentHandler(org.apache.tika.sax.ExpandedTitleContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser)

Example 68 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class TikaResource method produceOutput.

private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap<String, String> httpHeaders, final UriInfo info, final String format) {
    final Parser parser = createParser();
    final Metadata metadata = new Metadata();
    final ParseContext context = new ParseContext();
    fillMetadata(parser, metadata, context, httpHeaders);
    fillParseContext(context, httpHeaders, parser);
    logRequest(LOG, info, metadata);
    return new StreamingOutput() {

        public void write(OutputStream outputStream) throws IOException, WebApplicationException {
            Writer writer = new OutputStreamWriter(outputStream, UTF_8);
            ContentHandler content;
            try {
                SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
                TransformerHandler handler = factory.newTransformerHandler();
                handler.getTransformer().setOutputProperty(OutputKeys.METHOD, format);
                handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
                handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, UTF_8.name());
                handler.setResult(new StreamResult(writer));
                content = new ExpandedTitleContentHandler(handler);
            } catch (TransformerConfigurationException e) {
                throw new WebApplicationException(e);
            }
            parse(parser, LOG, info.getPath(), is, content, metadata, context);
        }
    };
}
Also used : TransformerHandler(javax.xml.transform.sax.TransformerHandler) StreamResult(javax.xml.transform.stream.StreamResult) TransformerConfigurationException(javax.xml.transform.TransformerConfigurationException) WebApplicationException(javax.ws.rs.WebApplicationException) OutputStream(java.io.OutputStream) Metadata(org.apache.tika.metadata.Metadata) SAXTransformerFactory(javax.xml.transform.sax.SAXTransformerFactory) StreamingOutput(javax.ws.rs.core.StreamingOutput) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ExpandedTitleContentHandler(org.apache.tika.sax.ExpandedTitleContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) ExpandedTitleContentHandler(org.apache.tika.sax.ExpandedTitleContentHandler) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser) ParseContext(org.apache.tika.parser.ParseContext) OutputStreamWriter(java.io.OutputStreamWriter) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter)

Example 69 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class UnpackerResource method process.

private Map<String, byte[]> process(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info, boolean saveAll) throws Exception {
    Metadata metadata = new Metadata();
    ParseContext pc = new ParseContext();
    Parser parser = TikaResource.createParser();
    if (parser instanceof DigestingParser) {
        //no need to digest for unwrapping
        parser = ((DigestingParser) parser).getWrappedParser();
    }
    TikaResource.fillMetadata(parser, metadata, pc, httpHeaders.getRequestHeaders());
    TikaResource.logRequest(LOG, info, metadata);
    ContentHandler ch;
    ByteArrayOutputStream text = new ByteArrayOutputStream();
    if (saveAll) {
        ch = new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(text, UTF_8)));
    } else {
        ch = new DefaultHandler();
    }
    Map<String, byte[]> files = new HashMap<>();
    MutableInt count = new MutableInt();
    pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, files));
    TikaResource.parse(parser, LOG, info.getPath(), is, ch, metadata, pc);
    if (count.intValue() == 0 && !saveAll) {
        throw new WebApplicationException(Response.Status.NO_CONTENT);
    }
    if (saveAll) {
        files.put(TEXT_FILENAME, text.toByteArray());
        ByteArrayOutputStream metaStream = new ByteArrayOutputStream();
        metadataToCsv(metadata, metaStream);
        files.put(META_FILENAME, metaStream.toByteArray());
    }
    return files;
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) WebApplicationException(javax.ws.rs.WebApplicationException) HashMap(java.util.HashMap) Metadata(org.apache.tika.metadata.Metadata) DigestingParser(org.apache.tika.parser.DigestingParser) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) DigestingParser(org.apache.tika.parser.DigestingParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) MutableInt(org.apache.commons.lang.mutable.MutableInt) ParseContext(org.apache.tika.parser.ParseContext) OutputStreamWriter(java.io.OutputStreamWriter)

Example 70 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class SXSLFExtractorTest method testPowerPoint.

/**
     * We have a number of different powerpoint files,
     * such as presentation, macro-enabled etc
     */
@Test
public void testPowerPoint() throws Exception {
    String[] extensions = new String[] { "pptx", "pptm", "ppsm", "ppsx", "potm" };
    String[] mimeTypes = new String[] { "application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.ms-powerpoint.presentation.macroenabled.12", "application/vnd.ms-powerpoint.slideshow.macroenabled.12", "application/vnd.openxmlformats-officedocument.presentationml.slideshow", "application/vnd.ms-powerpoint.template.macroenabled.12" };
    for (int i = 0; i < extensions.length; i++) {
        String extension = extensions[i];
        String filename = "testPPT." + extension;
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        try (InputStream input = getResourceAsStream("/test-documents/" + filename)) {
            parser.parse(input, handler, metadata, parseContext);
            assertEquals("Mime-type checking for " + filename, mimeTypes[i], metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
            String content = handler.toString();
            // Theme files don't have the text in them
            if (extension.equals("thmx")) {
                assertEquals("", content);
            } else {
                assertTrue("Text missing for " + filename + "\n" + content, content.contains("Attachment Test"));
                assertTrue("Text missing for " + filename + "\n" + content, content.contains("This is a test file data with the same content"));
                assertTrue("Text missing for " + filename + "\n" + content, content.contains("content parsing"));
                assertTrue("Text missing for " + filename + "\n" + content, content.contains("Different words to test against"));
                assertTrue("Text missing for " + filename + "\n" + content, content.contains("Mystery"));
            }
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

ContentHandler (org.xml.sax.ContentHandler)354 Metadata (org.apache.tika.metadata.Metadata)229 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)229 InputStream (java.io.InputStream)210 Test (org.junit.Test)208 ParseContext (org.apache.tika.parser.ParseContext)164 Parser (org.apache.tika.parser.Parser)106 TikaTest (org.apache.tika.TikaTest)103 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)102 TikaInputStream (org.apache.tika.io.TikaInputStream)75 ByteArrayInputStream (java.io.ByteArrayInputStream)64 SAXException (org.xml.sax.SAXException)40 IOException (java.io.IOException)34 TeeContentHandler (org.apache.tika.sax.TeeContentHandler)28 TikaException (org.apache.tika.exception.TikaException)24 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)24 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)24 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)21 AttributesImpl (org.xml.sax.helpers.AttributesImpl)21 InputSource (org.xml.sax.InputSource)20