Search in sources :

Example 11 with Metadata

Usage of org.apache.tika.metadata.Metadata in the Apache Jackrabbit project.

Taken from the class NodeIndexer, method addBinaryValue.

/**
 * Adds the binary value to the document as the named field.
 * <p>
 * This implementation checks if this {@link #node} is of type nt:resource
 * and if that is the case, tries to extract text from the binary property
 * using the {@link #parser}.
 *
 * @param doc           The document to which to add the field
 * @param fieldName     The name of the field to add
 * @param internalValue The value for the field to add to the document.
 */
protected void addBinaryValue(Document doc, String fieldName, InternalValue internalValue) {
    try {
        // only the jcr:data property of an nt:resource node is indexable here
        String expectedFieldName = mappings.getPrefix(Name.NS_JCR_URI) + ":data";
        if (!expectedFieldName.equals(fieldName)) {
            // anything else: we don't know how to index it
            return;
        }
        InternalValue mimeType = getValue(NameConstants.JCR_MIMETYPE);
        if (mimeType == null || !isSupportedMediaType(mimeType.getString())) {
            // no mime type, or one the parser cannot handle
            return;
        }
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, mimeType.getString());
        // jcr:encoding is not mandatory
        InternalValue charset = getValue(NameConstants.JCR_ENCODING);
        if (charset != null) {
            metadata.set(Metadata.CONTENT_ENCODING, charset.getString());
        }
        doc.add(createFulltextField(internalValue, metadata, false));
    } catch (Throwable t) {
        // TODO: How to recover from a transient indexing failure?
        log.warn("Exception while indexing binary property", t);
    }
}
Also used : Metadata(org.apache.tika.metadata.Metadata) InternalValue(org.apache.jackrabbit.core.value.InternalValue)

Example 12 with Metadata

Usage of org.apache.tika.metadata.Metadata in the Apache Jackrabbit Oak project.

Taken from the class SolrIndexEditor, method extractTextValues.

/**
 * Extracts a text value from every binary of the given property, feeding the
 * parser optional jcr:mimetype / jcr:encoding hints when the property is
 * jcr:data.
 *
 * @param property the binary property to extract text from
 * @param state    the node state holding the optional type/encoding hints
 * @return one extracted string per binary value of the property
 */
private List<String> extractTextValues(PropertyState property, NodeState state) {
    Metadata metadata = new Metadata();
    if (JCR_DATA.equals(property.getName())) {
        // both hints are optional; set them only when present
        String mimeType = state.getString(JcrConstants.JCR_MIMETYPE);
        if (mimeType != null) {
            metadata.set(Metadata.CONTENT_TYPE, mimeType);
        }
        String charset = state.getString(JcrConstants.JCR_ENCODING);
        if (charset != null) {
            metadata.set(Metadata.CONTENT_ENCODING, charset);
        }
    }
    List<String> extracted = new LinkedList<String>();
    for (Blob blob : property.getValue(Type.BINARIES)) {
        extracted.add(parseStringValue(blob, metadata));
    }
    return extracted;
}
Also used : Blob(org.apache.jackrabbit.oak.api.Blob) Metadata(org.apache.tika.metadata.Metadata) LinkedList(java.util.LinkedList)

Example 13 with Metadata

Usage of org.apache.tika.metadata.Metadata in the Apache Lucene/Solr project.

Taken from the class TikaEntityProcessor, method nextRow.

/**
 * Produces the next (and only) row for this entity by parsing the entity's
 * input stream with Tika.
 * <p>
 * The extracted body text is put under the "text" key (unless format is
 * "none"), and any metadata fields flagged with meta="true" in the entity
 * configuration are copied into the row.
 *
 * @return the extracted row, or {@code null} once the single document has
 *         been processed
 */
@Override
public Map<String, Object> nextRow() {
    if (done)
        return null;
    Map<String, Object> row = new HashMap<>();
    DataSource<InputStream> dataSource = context.getDataSource();
    InputStream is = dataSource.getData(context.getResolvedEntityAttribute(URL));
    ContentHandler contentHandler = null;
    Metadata metadata = new Metadata();
    StringWriter sw = new StringWriter();
    try {
        if ("html".equals(format)) {
            contentHandler = getHtmlHandler(sw);
        } else if ("xml".equals(format)) {
            contentHandler = getXmlContentHandler(sw);
        } else if ("text".equals(format)) {
            contentHandler = getTextContentHandler(sw);
        } else if ("none".equals(format)) {
            // "none": parse only for metadata, discard the body text
            contentHandler = new DefaultHandler();
        }
    } catch (TransformerConfigurationException e) {
        wrapAndThrow(SEVERE, e, "Unable to create content handler");
    }
    Parser tikaParser = null;
    if (parser.equals(AUTO_PARSER)) {
        tikaParser = new AutoDetectParser(tikaConfig);
    } else {
        tikaParser = context.getSolrCore().getResourceLoader().newInstance(parser, Parser.class);
    }
    try {
        // NOTE: this local deliberately shadows the entity-processor field 'context'
        ParseContext context = new ParseContext();
        if ("identity".equals(htmlMapper)) {
            context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
        }
        if (extractEmbedded) {
            context.set(Parser.class, tikaParser);
        }
        tikaParser.parse(is, contentHandler, metadata, context);
    } catch (Exception e) {
        if (SKIP.equals(onError)) {
            throw new DataImportHandlerException(DataImportHandlerException.SKIP_ROW, "Document skipped :" + e.getMessage());
        }
        wrapAndThrow(SEVERE, e, "Unable to read content");
    } finally {
        // BUGFIX: the stream was previously closed after this try block, so it
        // leaked whenever parse() failed and the catch rethrew (both the
        // SKIP_ROW throw and wrapAndThrow propagate an exception).
        IOUtils.closeQuietly(is);
    }
    // copy requested metadata fields (meta="true") into the row
    for (Map<String, String> field : context.getAllEntityFields()) {
        if (!"true".equals(field.get("meta")))
            continue;
        String col = field.get(COLUMN);
        String s = metadata.get(col);
        if (s != null)
            row.put(col, s);
    }
    if (!"none".equals(format))
        row.put("text", sw.toString());
    tryToAddLatLon(metadata, row);
    done = true;
    return row;
}
Also used : TransformerConfigurationException(javax.xml.transform.TransformerConfigurationException) HashMap(java.util.HashMap) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) TransformerConfigurationException(javax.xml.transform.TransformerConfigurationException) SAXException(org.xml.sax.SAXException) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) StringWriter(java.io.StringWriter) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Example 14 with Metadata

Usage of org.apache.tika.metadata.Metadata in the Apache Lucene/Solr project.

Taken from the class ExtractingDocumentLoader, method load.

/**
 * Loads one content stream: selects a Tika parser (an explicit stream.type
 * parameter wins, otherwise auto-detection), parses the stream into a
 * SolrContentHandler, and then either indexes the resulting document or —
 * for extractOnly requests — serializes the raw extraction output (text or
 * XML) into the response together with all captured metadata.
 *
 * @param req       the Solr request supplying parameters and the core
 * @param rsp       the response; receives extract-only output and metadata
 * @param stream    the content stream to parse
 * @param processor the update processor used when the document is indexed
 * @throws Exception on unrecoverable parsing or indexing failure
 */
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
        //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
        MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
        parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
        // no explicit type given: fall back to Tika content sniffing
        parser = autoDetectParser;
    }
    if (parser != null) {
        Metadata metadata = new Metadata();
        // If you specify the resource name (the filename, roughly) with this parameter,
        // then Tika can make use of it in guessing the appropriate MIME type:
        String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
        if (resourceName != null) {
            metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
        }
        // Provide stream's content type as hint for auto detection
        if (stream.getContentType() != null) {
            metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
        }
        InputStream inputStream = null;
        try {
            inputStream = stream.getStream();
            // record stream provenance so it can be surfaced in the metadata response
            metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
            metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
            metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
            metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
            // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
            String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
            if (charset != null) {
                metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
            }
            String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
            boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
            SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
            ContentHandler parsingHandler = handler;
            StringWriter writer = null;
            BaseMarkupSerializer serializer = null;
            // extract-only: route SAX events into a text/XML serializer instead of indexing
            if (extractOnly == true) {
                String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
                writer = new StringWriter();
                if (extractFormat.equals(TEXT_FORMAT)) {
                    serializer = new TextSerializer();
                    serializer.setOutputCharStream(writer);
                    serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
                } else {
                    serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
                }
                if (xpathExpr != null) {
                    Matcher matcher = PARSER.parse(xpathExpr);
                    //The MatchingContentHandler does not invoke startDocument.  See http://tika.markmail.org/message/kknu3hw7argwiqin
                    serializer.startDocument();
                    parsingHandler = new MatchingContentHandler(serializer, matcher);
                } else {
                    parsingHandler = serializer;
                }
            } else if (xpathExpr != null) {
                // indexing path with an xpath filter: only matching events reach the handler
                Matcher matcher = PARSER.parse(xpathExpr);
                parsingHandler = new MatchingContentHandler(handler, matcher);
            }
            try {
                //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
                ParseContext context = parseContextConfig.create();
                context.set(Parser.class, parser);
                context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
                // Password handling
                RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
                String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
                if (pwMapFile != null && pwMapFile.length() > 0) {
                    InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
                    if (is != null) {
                        log.debug("Password file supplied: " + pwMapFile);
                        epp.parse(is);
                    }
                }
                context.set(PasswordProvider.class, epp);
                // an explicit per-request password overrides any password-map match
                String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
                if (resourcePassword != null) {
                    epp.setExplicitPassword(resourcePassword);
                    log.debug("Literal password supplied for file " + resourceName);
                }
                parser.parse(inputStream, parsingHandler, metadata, context);
            } catch (TikaException e) {
                // optionally index the document with whatever metadata was captured
                if (ignoreTikaException)
                    log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage()).append(". metadata=").append(metadata.toString()).toString());
                else
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
            }
            if (extractOnly == false) {
                addDoc(handler);
            } else {
                //serializer is not null, so we need to call endDoc on it if using xpath
                if (xpathExpr != null) {
                    serializer.endDocument();
                }
                rsp.add(stream.getName(), writer.toString());
                writer.close();
                // echo all captured metadata back in a parallel "<name>_metadata" entry
                String[] names = metadata.names();
                NamedList metadataNL = new NamedList();
                for (int i = 0; i < names.length; i++) {
                    String[] vals = metadata.getValues(names[i]);
                    metadataNL.add(names[i], vals);
                }
                rsp.add(stream.getName() + "_metadata", metadataNL);
            }
        } catch (SAXException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers.  Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
    }
}
Also used : Matcher(org.apache.tika.sax.xpath.Matcher) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) Metadata(org.apache.tika.metadata.Metadata) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) SAXException(org.xml.sax.SAXException) StringWriter(java.io.StringWriter) MediaType(org.apache.tika.mime.MediaType) SolrException(org.apache.solr.common.SolrException) DefaultParser(org.apache.tika.parser.DefaultParser) XMLSerializer(org.apache.xml.serialize.XMLSerializer) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) NamedList(org.apache.solr.common.util.NamedList) BaseMarkupSerializer(org.apache.xml.serialize.BaseMarkupSerializer) OutputFormat(org.apache.xml.serialize.OutputFormat) Parser(org.apache.tika.parser.Parser) XPathParser(org.apache.tika.sax.xpath.XPathParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) TextSerializer(org.apache.xml.serialize.TextSerializer) ParseContext(org.apache.tika.parser.ParseContext)

Example 15 with Metadata

Usage of org.apache.tika.metadata.Metadata in the Apache Tika project.

Taken from the class TikaTest, method getRecursiveMetadata.

/**
 * Recursively parses the named test document and returns one Metadata object
 * per parsed document (container plus embedded attachments).
 *
 * @param filePath file name under /test-documents/ on the classpath
 * @param context  parse context to use for the recursive parse
 * @return metadata list collected by the recursive wrapper
 * @throws Exception on any parse or I/O failure
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
    // capture embedded content as XML with no write limit (-1)
    BasicContentHandlerFactory handlerFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(new AutoDetectParser(), handlerFactory);
    try (InputStream stream = getResourceAsStream("/test-documents/" + filePath)) {
        recursiveParser.parse(stream, new DefaultHandler(), new Metadata(), context);
    }
    return recursiveParser.getMetadata();
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Aggregations

Metadata (org.apache.tika.metadata.Metadata)643 Test (org.junit.Test)467 InputStream (java.io.InputStream)318 ParseContext (org.apache.tika.parser.ParseContext)281 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)268 TikaTest (org.apache.tika.TikaTest)257 ContentHandler (org.xml.sax.ContentHandler)228 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)151 ByteArrayInputStream (java.io.ByteArrayInputStream)141 Parser (org.apache.tika.parser.Parser)134 TikaInputStream (org.apache.tika.io.TikaInputStream)131 IOException (java.io.IOException)62 DefaultHandler (org.xml.sax.helpers.DefaultHandler)59 TikaException (org.apache.tika.exception.TikaException)46 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)36 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)36 StringWriter (java.io.StringWriter)33 Tika (org.apache.tika.Tika)28 FileInputStream (java.io.FileInputStream)27 MediaType (org.apache.tika.mime.MediaType)27