Search in sources:

Example 86 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TikaResource method createParser.

/**
 * Builds a parser from {@code tikaConfig}.
 *
 * <p>The returned parser is an {@link AutoDetectParser} whose parser map sends
 * {@code application/xml} to an {@link HtmlParser}, and whose fallback parser rejects
 * every parse call with HTTP 415 (Unsupported Media Type).
 *
 * @return the configured auto-detect parser, wrapped in a {@link DigestingParser}
 *         when {@code digester} is non-null
 */
@SuppressWarnings("serial")
public static Parser createParser() {
    final AutoDetectParser autoDetect = new AutoDetectParser(tikaConfig);

    // Register the HTML parser for application/xml, then hand the map back to the parser.
    Map<MediaType, Parser> byType = autoDetect.getParsers();
    byType.put(MediaType.APPLICATION_XML, new HtmlParser());
    autoDetect.setParsers(byType);

    // The fallback advertises the same types as the auto-detect parser but never parses:
    // any call to it is answered with 415.
    autoDetect.setFallback(new Parser() {

        @Override
        public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
            return autoDetect.getSupportedTypes(parseContext);
        }

        @Override
        public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) {
            throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
        }
    });

    if (digester == null) {
        return autoDetect;
    }
    return new DigestingParser(autoDetect, digester);
}
Also used : HtmlParser(org.apache.tika.parser.html.HtmlParser) Set(java.util.Set) WebApplicationException(javax.ws.rs.WebApplicationException) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) DigestingParser(org.apache.tika.parser.DigestingParser) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ExpandedTitleContentHandler(org.apache.tika.sax.ExpandedTitleContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser)

Example 87 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TikaResource method fillMetadata.

/**
 * Copies request information from the HTTP headers into {@code metadata} and {@code context}.
 *
 * <ul>
 *   <li>If {@code detectFilename} yields a name, it is stored under
 *       {@code TikaMetadataKeys.RESOURCE_NAME_KEY}.</li>
 *   <li>If a {@code Content-Type} header is present, its subtype is not {@code xml}, and it
 *       is not equal to {@code application/octet-stream}, it is added to the metadata. The
 *       parser's detector is then replaced by one that returns that declared type and only
 *       delegates to the previous detector when the declared type is missing or unparseable.</li>
 *   <li>If a {@code Password} header is present, a {@link PasswordProvider} returning it is
 *       registered in the parse context.</li>
 * </ul>
 *
 * @param parser      parser whose detector may be replaced
 * @param metadata    metadata to populate
 * @param context     parse context that may receive a password provider
 * @param httpHeaders request headers to read from
 */
@SuppressWarnings("serial")
public static void fillMetadata(Parser parser, Metadata metadata, ParseContext context, MultivaluedMap<String, String> httpHeaders) {
    final String resourceName = detectFilename(httpHeaders);
    if (resourceName != null) {
        metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
    }

    final String rawContentType = httpHeaders.getFirst(HttpHeaders.CONTENT_TYPE);
    javax.ws.rs.core.MediaType declaredType = null;
    if (rawContentType != null) {
        declaredType = javax.ws.rs.core.MediaType.valueOf(rawContentType);
    }

    // A missing header, an "xml" subtype, or application/octet-stream are all treated as
    // "no declared type"; the checks short-circuit in this order.
    final boolean ignoreDeclaredType = declaredType == null
            || "xml".equals(declaredType.getSubtype())
            || declaredType.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE);

    if (!ignoreDeclaredType) {
        metadata.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, declaredType.toString());
        final Detector delegate = getDetector(parser);
        setDetector(parser, new Detector() {

            public MediaType detect(InputStream inputStream, Metadata metadata) throws IOException {
                String declared = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
                // MediaType.parse can return null if the value is not a valid mime type
                MediaType parsed = (declared == null) ? null : MediaType.parse(declared);
                // make sure never to return null -- TIKA-1845
                return (parsed != null) ? parsed : delegate.detect(inputStream, metadata);
            }
        });
    }

    final String password = httpHeaders.getFirst("Password");
    if (password == null) {
        return;
    }
    context.set(PasswordProvider.class, new PasswordProvider() {

        @Override
        public String getPassword(Metadata metadata) {
            return password;
        }
    });
}
Also used : Detector(org.apache.tika.detect.Detector) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException) PasswordProvider(org.apache.tika.parser.PasswordProvider)

Example 88 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TikaResource method produceOutput.

/**
 * Prepares a parse of {@code is} and returns a {@link StreamingOutput} that performs it.
 *
 * <p>The parser, metadata and parse context are set up, and the request is logged, when this
 * method is called. The parse itself runs when the returned object's {@code write} method is
 * invoked: SAX events are serialized as UTF-8, indented, through a transformer whose output
 * method is {@code format}.
 *
 * @param is          stream to parse
 * @param httpHeaders request headers used to fill the metadata and parse context
 * @param info        request URI info; its path is passed to {@code parse}
 * @param format      value for the transformer's {@code OutputKeys.METHOD} property
 * @return a streaming output that parses {@code is} into the supplied output stream
 */
private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap<String, String> httpHeaders, final UriInfo info, final String format) {
    final Parser parser = createParser();
    final Metadata metadata = new Metadata();
    final ParseContext context = new ParseContext();

    fillMetadata(parser, metadata, context, httpHeaders);
    fillParseContext(context, httpHeaders, parser);
    logRequest(LOG, info, metadata);

    return new StreamingOutput() {

        public void write(OutputStream outputStream) throws IOException, WebApplicationException {
            final Writer sink = new OutputStreamWriter(outputStream, UTF_8);

            // Only handler creation can raise the checked configuration exception.
            final TransformerHandler serializer;
            try {
                SAXTransformerFactory saxFactory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
                serializer = saxFactory.newTransformerHandler();
            } catch (TransformerConfigurationException e) {
                throw new WebApplicationException(e);
            }

            serializer.getTransformer().setOutputProperty(OutputKeys.METHOD, format);
            serializer.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.getTransformer().setOutputProperty(OutputKeys.ENCODING, UTF_8.name());
            serializer.setResult(new StreamResult(sink));

            final ContentHandler content = new ExpandedTitleContentHandler(serializer);
            parse(parser, LOG, info.getPath(), is, content, metadata, context);
        }
    };
}
Also used : TransformerHandler(javax.xml.transform.sax.TransformerHandler) StreamResult(javax.xml.transform.stream.StreamResult) TransformerConfigurationException(javax.xml.transform.TransformerConfigurationException) WebApplicationException(javax.ws.rs.WebApplicationException) OutputStream(java.io.OutputStream) Metadata(org.apache.tika.metadata.Metadata) SAXTransformerFactory(javax.xml.transform.sax.SAXTransformerFactory) StreamingOutput(javax.ws.rs.core.StreamingOutput) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ExpandedTitleContentHandler(org.apache.tika.sax.ExpandedTitleContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) ExpandedTitleContentHandler(org.apache.tika.sax.ExpandedTitleContentHandler) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser) ParseContext(org.apache.tika.parser.ParseContext) OutputStreamWriter(java.io.OutputStreamWriter) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter)

Example 89 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class UnpackerResource method process.

/**
 * Parses {@code is} and collects the files gathered by a {@code MyEmbeddedDocumentExtractor}
 * into a map of file name to bytes.
 *
 * <p>When {@code saveAll} is true, the body text and a CSV rendering of the metadata are added
 * under {@code TEXT_FILENAME} and {@code META_FILENAME}. When it is false and the extractor's
 * counter is still zero after parsing, the request is answered with HTTP 204 (No Content).
 *
 * @param is          stream to parse
 * @param httpHeaders request headers used to fill the metadata
 * @param info        request URI info; its path is passed to the parse call
 * @param saveAll     whether to also return the text and metadata of the container document
 * @return map of file name to content
 * @throws WebApplicationException with status 204 if nothing was counted and {@code saveAll} is false
 * @throws Exception if parsing or metadata serialization fails
 */
private Map<String, byte[]> process(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info, boolean saveAll) throws Exception {
    Metadata metadata = new Metadata();
    ParseContext parseContext = new ParseContext();

    Parser parser = TikaResource.createParser();
    if (parser instanceof DigestingParser) {
        //no need to digest for unwrapping
        parser = ((DigestingParser) parser).getWrappedParser();
    }

    TikaResource.fillMetadata(parser, metadata, parseContext, httpHeaders.getRequestHeaders());
    TikaResource.logRequest(LOG, info, metadata);

    // Body text is only captured when everything is to be saved; otherwise events are dropped.
    ByteArrayOutputStream textBytes = new ByteArrayOutputStream();
    ContentHandler handler = saveAll
            ? new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(textBytes, UTF_8)))
            : new DefaultHandler();

    Map<String, byte[]> extracted = new HashMap<>();
    MutableInt embeddedCount = new MutableInt();
    parseContext.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(embeddedCount, extracted));

    TikaResource.parse(parser, LOG, info.getPath(), is, handler, metadata, parseContext);

    if (saveAll) {
        extracted.put(TEXT_FILENAME, textBytes.toByteArray());
        ByteArrayOutputStream metadataBytes = new ByteArrayOutputStream();
        metadataToCsv(metadata, metadataBytes);
        extracted.put(META_FILENAME, metadataBytes.toByteArray());
    } else if (embeddedCount.intValue() == 0) {
        throw new WebApplicationException(Response.Status.NO_CONTENT);
    }
    return extracted;
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) WebApplicationException(javax.ws.rs.WebApplicationException) HashMap(java.util.HashMap) Metadata(org.apache.tika.metadata.Metadata) DigestingParser(org.apache.tika.parser.DigestingParser) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) DigestingParser(org.apache.tika.parser.DigestingParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) MutableInt(org.apache.commons.lang.mutable.MutableInt) ParseContext(org.apache.tika.parser.ParseContext) OutputStreamWriter(java.io.OutputStreamWriter)

Example 90 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class RecursiveMetadataResourceTest method testHandlerTypeInMultipartXML.

/**
 * Posts the recursive test document as multipart form data to each handler-type endpoint and
 * checks the content recorded for the seventh metadata entry (index 6).
 *
 * <p>The default, "unparseable" and XML paths are expected to yield XHTML, the text path to
 * yield content starting with {@code embed_3}, and the ignore path to yield no content. Every
 * variant is expected to return 12 metadata entries.
 */
@Test
public void testHandlerTypeInMultipartXML() throws Exception {
    final String formBase = endPoint + META_PATH + FORM_PATH;
    final String xhtmlPrefix = "<html xmlns=\"http://www.w3.org/1999/xhtml\">";

    //default unspecified
    List<Metadata> metadataList = postRecursiveDocAsMultipart(WebClient.create(formBase));
    assertEquals(12, metadataList.size());
    String content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
    assertTrue(content.startsWith(xhtmlPrefix));

    //unparseable
    metadataList = postRecursiveDocAsMultipart(WebClient.create(formBase + UNPARSEABLE_PATH));
    assertEquals(12, metadataList.size());
    content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
    assertTrue(content.startsWith(xhtmlPrefix));

    //xml
    metadataList = postRecursiveDocAsMultipart(WebClient.create(formBase + XML_PATH));
    assertEquals(12, metadataList.size());
    content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
    assertTrue(content.startsWith(xhtmlPrefix));

    //text
    metadataList = postRecursiveDocAsMultipart(WebClient.create(formBase + TEXT_PATH));
    assertEquals(12, metadataList.size());
    content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
    assertTrue(content.startsWith("embed_3"));

    //ignore -- no content; this variant also sends the handler as a query parameter
    metadataList = postRecursiveDocAsMultipart(WebClient.create(formBase + IGNORE_PATH).query("handler", "ignore"));
    assertEquals(12, metadataList.size());
    assertNull(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT));
}

/**
 * Posts {@code TEST_RECURSIVE_DOC} as a single multipart attachment through {@code client},
 * asking for JSON, and parses the response into a metadata list.
 *
 * <p>The response reader is closed before returning so the entity stream is not leaked.
 *
 * @param client web client already pointed at the endpoint under test
 * @return metadata entries read from the JSON response
 * @throws Exception if the request or JSON parsing fails
 */
private List<Metadata> postRecursiveDocAsMultipart(WebClient client) throws Exception {
    Attachment attachmentPart = new Attachment("myworddocx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
    Response response = client.type("multipart/form-data").accept("application/json").post(attachmentPart);
    try (Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8)) {
        return JsonMetadataList.fromJson(reader);
    }
}
Also used : Response(javax.ws.rs.core.Response) InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) Attachment(org.apache.cxf.jaxrs.ext.multipart.Attachment) WebClient(org.apache.cxf.jaxrs.client.WebClient) Test(org.junit.Test)

Aggregations

Metadata (org.apache.tika.metadata.Metadata)651 Test (org.junit.Test)467 InputStream (java.io.InputStream)320 ParseContext (org.apache.tika.parser.ParseContext)283 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)269 TikaTest (org.apache.tika.TikaTest)257 ContentHandler (org.xml.sax.ContentHandler)229 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)154 ByteArrayInputStream (java.io.ByteArrayInputStream)143 Parser (org.apache.tika.parser.Parser)136 TikaInputStream (org.apache.tika.io.TikaInputStream)133 IOException (java.io.IOException)66 DefaultHandler (org.xml.sax.helpers.DefaultHandler)59 TikaException (org.apache.tika.exception.TikaException)48 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)36 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)36 StringWriter (java.io.StringWriter)33 Tika (org.apache.tika.Tika)29 MediaType (org.apache.tika.mime.MediaType)29 SAXException (org.xml.sax.SAXException)29