Search in sources :

Example 76 with NamedList

use of org.apache.solr.common.util.NamedList in project lucene-solr by apache.

the class SolrPluginUtils method copyNamedListIntoArrayByDocPosInResponse.

/** Copies the given {@code namedList} assumed to have doc uniqueKey keyed data into {@code destArr}
   * at the position of the document in the response.  destArr is assumed to be the same size as
   * {@code resultIds} is.  {@code resultIds} comes from {@link ResponseBuilder#resultIds}.  If the doc key
   * isn't in {@code resultIds} then it is ignored.
   * Note: most likely you will call {@link #removeNulls(Map.Entry[], NamedList)} sometime after calling this. */
public static void copyNamedListIntoArrayByDocPosInResponse(NamedList namedList, Map<Object, ShardDoc> resultIds, Map.Entry<String, Object>[] destArr) {
    assert resultIds.size() == destArr.length;
    for (int i = 0; i < namedList.size(); i++) {
        String id = namedList.getName(i);
        // TODO: lookup won't work for non-string ids... String vs Float
        ShardDoc sdoc = resultIds.get(id);
        if (sdoc != null) {
            // maybe null when rb.onePassDistributedQuery
            int idx = sdoc.positionInResponse;
            destArr[idx] = new NamedList.NamedListEntry<>(id, namedList.getVal(i));
        }
    }
}
Also used : NamedList(org.apache.solr.common.util.NamedList) ShardDoc(org.apache.solr.handler.component.ShardDoc)

Example 77 with NamedList

use of org.apache.solr.common.util.NamedList in project lucene-solr by apache.

the class SolrPluginUtils method explanationToNamedList.

public static NamedList<Object> explanationToNamedList(Explanation e) {
    NamedList<Object> out = new SimpleOrderedMap<>();
    out.add("match", e.isMatch());
    out.add("value", e.getValue());
    out.add("description", e.getDescription());
    Explanation[] details = e.getDetails();
    // short circut out
    if (0 == details.length)
        return out;
    List<NamedList<Object>> kids = new ArrayList<>(details.length);
    for (Explanation d : details) {
        kids.add(explanationToNamedList(d));
    }
    out.add("details", kids);
    return out;
}
Also used : Explanation(org.apache.lucene.search.Explanation) NamedList(org.apache.solr.common.util.NamedList) ArrayList(java.util.ArrayList) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap)

Example 78 with NamedList

use of org.apache.solr.common.util.NamedList in project lucene-solr by apache.

the class ExtractingDocumentLoader method load.

@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
        //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
        MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
        parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
        parser = autoDetectParser;
    }
    if (parser != null) {
        Metadata metadata = new Metadata();
        // If you specify the resource name (the filename, roughly) with this parameter,
        // then Tika can make use of it in guessing the appropriate MIME type:
        String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
        if (resourceName != null) {
            metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
        }
        // Provide stream's content type as hint for auto detection
        if (stream.getContentType() != null) {
            metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
        }
        InputStream inputStream = null;
        try {
            inputStream = stream.getStream();
            metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
            metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
            metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
            metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
            // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
            String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
            if (charset != null) {
                metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
            }
            String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
            boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
            SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
            ContentHandler parsingHandler = handler;
            StringWriter writer = null;
            BaseMarkupSerializer serializer = null;
            if (extractOnly == true) {
                String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
                writer = new StringWriter();
                if (extractFormat.equals(TEXT_FORMAT)) {
                    serializer = new TextSerializer();
                    serializer.setOutputCharStream(writer);
                    serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
                } else {
                    serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
                }
                if (xpathExpr != null) {
                    Matcher matcher = PARSER.parse(xpathExpr);
                    //The MatchingContentHandler does not invoke startDocument.  See http://tika.markmail.org/message/kknu3hw7argwiqin
                    serializer.startDocument();
                    parsingHandler = new MatchingContentHandler(serializer, matcher);
                } else {
                    parsingHandler = serializer;
                }
            } else if (xpathExpr != null) {
                Matcher matcher = PARSER.parse(xpathExpr);
                parsingHandler = new MatchingContentHandler(handler, matcher);
            }
            try {
                //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
                ParseContext context = parseContextConfig.create();
                context.set(Parser.class, parser);
                context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
                // Password handling
                RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
                String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
                if (pwMapFile != null && pwMapFile.length() > 0) {
                    InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
                    if (is != null) {
                        log.debug("Password file supplied: " + pwMapFile);
                        epp.parse(is);
                    }
                }
                context.set(PasswordProvider.class, epp);
                String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
                if (resourcePassword != null) {
                    epp.setExplicitPassword(resourcePassword);
                    log.debug("Literal password supplied for file " + resourceName);
                }
                parser.parse(inputStream, parsingHandler, metadata, context);
            } catch (TikaException e) {
                if (ignoreTikaException)
                    log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage()).append(". metadata=").append(metadata.toString()).toString());
                else
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
            }
            if (extractOnly == false) {
                addDoc(handler);
            } else {
                //serializer is not null, so we need to call endDoc on it if using xpath
                if (xpathExpr != null) {
                    serializer.endDocument();
                }
                rsp.add(stream.getName(), writer.toString());
                writer.close();
                String[] names = metadata.names();
                NamedList metadataNL = new NamedList();
                for (int i = 0; i < names.length; i++) {
                    String[] vals = metadata.getValues(names[i]);
                    metadataNL.add(names[i], vals);
                }
                rsp.add(stream.getName() + "_metadata", metadataNL);
            }
        } catch (SAXException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers.  Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
    }
}
Also used : Matcher(org.apache.tika.sax.xpath.Matcher) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) Metadata(org.apache.tika.metadata.Metadata) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) SAXException(org.xml.sax.SAXException) StringWriter(java.io.StringWriter) MediaType(org.apache.tika.mime.MediaType) SolrException(org.apache.solr.common.SolrException) DefaultParser(org.apache.tika.parser.DefaultParser) XMLSerializer(org.apache.xml.serialize.XMLSerializer) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) NamedList(org.apache.solr.common.util.NamedList) BaseMarkupSerializer(org.apache.xml.serialize.BaseMarkupSerializer) OutputFormat(org.apache.xml.serialize.OutputFormat) Parser(org.apache.tika.parser.Parser) XPathParser(org.apache.tika.sax.xpath.XPathParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) TextSerializer(org.apache.xml.serialize.TextSerializer) ParseContext(org.apache.tika.parser.ParseContext)

Example 79 with NamedList

use of org.apache.solr.common.util.NamedList in project lucene-solr by apache.

the class ExtractingRequestHandlerTest method testExtractOnly.

// Note: If you load a plain text file specifying neither MIME type nor filename, extraction will silently fail. This is because Tika's
// automatic MIME type detection will fail, and it will default to using an empty-string-returning default parser
@Test
public void testExtractOnly() throws Exception {
    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
    assertTrue("handler is null and it shouldn't be", handler != null);
    SolrQueryResponse rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true");
    assertTrue("rsp is null and it shouldn't be", rsp != null);
    NamedList list = rsp.getValues();
    String extraction = (String) list.get("solr-word.pdf");
    assertTrue("extraction is null and it shouldn't be", extraction != null);
    assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
    NamedList nl = (NamedList) list.get("solr-word.pdf_metadata");
    assertTrue("nl is null and it shouldn't be", nl != null);
    Object title = nl.get("title");
    assertTrue("title is null and it shouldn't be", title != null);
    assertTrue(extraction.indexOf("<?xml") != -1);
    rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true", ExtractingParams.EXTRACT_FORMAT, ExtractingDocumentLoader.TEXT_FORMAT);
    assertTrue("rsp is null and it shouldn't be", rsp != null);
    list = rsp.getValues();
    extraction = (String) list.get("solr-word.pdf");
    assertTrue("extraction is null and it shouldn't be", extraction != null);
    assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
    assertTrue(extraction.indexOf("<?xml") == -1);
    nl = (NamedList) list.get("solr-word.pdf_metadata");
    assertTrue("nl is null and it shouldn't be", nl != null);
    title = nl.get("title");
    assertTrue("title is null and it shouldn't be", title != null);
}
Also used : SolrQueryResponse(org.apache.solr.response.SolrQueryResponse) NamedList(org.apache.solr.common.util.NamedList) Test(org.junit.Test)

Example 80 with NamedList

use of org.apache.solr.common.util.NamedList in project lucene-solr by apache.

the class ExtractingRequestHandler method inform.

@Override
public void inform(SolrCore core) {
    if (initArgs != null) {
        //if relative,then relative to config dir, otherwise, absolute path
        String tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION);
        if (tikaConfigLoc != null) {
            File configFile = new File(tikaConfigLoc);
            if (configFile.isAbsolute() == false) {
                configFile = new File(core.getResourceLoader().getConfigDir(), configFile.getPath());
            }
            try {
                config = new TikaConfig(configFile);
            } catch (Exception e) {
                throw new SolrException(ErrorCode.SERVER_ERROR, e);
            }
        }
        String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG);
        if (parseContextConfigLoc != null) {
            try {
                parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc);
            } catch (Exception e) {
                throw new SolrException(ErrorCode.SERVER_ERROR, e);
            }
        }
        NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
        if (configDateFormats != null && configDateFormats.size() > 0) {
            dateFormats = new HashSet<>();
            Iterator<Map.Entry> it = configDateFormats.iterator();
            while (it.hasNext()) {
                String format = (String) it.next().getValue();
                log.info("Adding Date Format: " + format);
                dateFormats.add(format);
            }
        }
    }
    if (config == null) {
        try {
            config = getDefaultConfig(core.getResourceLoader().getClassLoader());
        } catch (MimeTypeException | IOException e) {
            throw new SolrException(ErrorCode.SERVER_ERROR, e);
        }
    }
    if (parseContextConfig == null) {
        parseContextConfig = new ParseContextConfig();
    }
    factory = createFactory();
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) NamedList(org.apache.solr.common.util.NamedList) IOException(java.io.IOException) IOException(java.io.IOException) SolrException(org.apache.solr.common.SolrException) MimeTypeException(org.apache.tika.mime.MimeTypeException) MimeTypeException(org.apache.tika.mime.MimeTypeException) File(java.io.File) SolrException(org.apache.solr.common.SolrException)

Aggregations

NamedList (org.apache.solr.common.util.NamedList)440 Test (org.junit.Test)125 ArrayList (java.util.ArrayList)111 Map (java.util.Map)83 ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams)83 SolrException (org.apache.solr.common.SolrException)80 SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap)79 List (java.util.List)76 HashMap (java.util.HashMap)65 SolrQueryResponse (org.apache.solr.response.SolrQueryResponse)55 IOException (java.io.IOException)53 SolrDocumentList (org.apache.solr.common.SolrDocumentList)45 QueryRequest (org.apache.solr.client.solrj.request.QueryRequest)35 SolrQueryRequest (org.apache.solr.request.SolrQueryRequest)35 SolrParams (org.apache.solr.common.params.SolrParams)31 LocalSolrQueryRequest (org.apache.solr.request.LocalSolrQueryRequest)31 QueryResponse (org.apache.solr.client.solrj.response.QueryResponse)30 SolrCore (org.apache.solr.core.SolrCore)30 HttpSolrClient (org.apache.solr.client.solrj.impl.HttpSolrClient)27 SolrIndexSearcher (org.apache.solr.search.SolrIndexSearcher)27