Search in sources :

Example 1 with Document

use of org.carrot2.core.Document in project lucene-solr by apache.

the class CarrotClusteringEngine method cluster.

/**
 * Runs Carrot2 clustering over the given Solr documents and converts the
 * resulting clusters to the output structure built by
 * {@code clustersToNamedList}.
 *
 * @param query       the query; its string form is passed to Carrot2
 * @param solrDocList the documents to cluster
 * @param docIds      document-to-id mapping, forwarded to {@code getDocuments}
 * @param sreq        the current request; supplies parameters and field names
 * @return the converted clusters
 * @throws SolrException with {@code SERVER_ERROR} if anything fails on the way
 */
@Override
public Object cluster(Query query, SolrDocumentList solrDocList, Map<SolrDocument, Integer> docIds, SolrQueryRequest sreq) {
    try {
        // Assemble the attribute map handed to the Carrot2 controller.
        final Map<String, Object> carrotAttributes = new HashMap<>();
        final List<Document> carrotDocuments = getDocuments(solrDocList, docIds, query, sreq);
        carrotAttributes.put(AttributeNames.DOCUMENTS, carrotDocuments);
        carrotAttributes.put(AttributeNames.QUERY, query.toString());
        // Fields on which clustering runs.
        carrotAttributes.put("solrFieldNames", getFieldsForClustering(sreq));
        // Request-level overrides, if any, are applied last.
        extractCarrotAttributes(sreq.getParams(), carrotAttributes);

        // Carrot2 resolves certain classes (e.g. custom tokenizer/stemmer)
        // through the current thread's context class loader at runtime.
        // The loader is swapped for the duration of the clustering call so
        // that classes from contrib JARs are visible, and restored afterwards.
        final Thread currentThread = Thread.currentThread();
        final ClassLoader originalLoader = currentThread.getContextClassLoader();
        try {
            currentThread.setContextClassLoader(core.getResourceLoader().getClassLoader());
            final Object converted = clustersToNamedList(
                controller.process(carrotAttributes, clusteringAlgorithmClass).getClusters(),
                sreq.getParams());
            return converted;
        } finally {
            currentThread.setContextClassLoader(originalLoader);
        }
    } catch (Exception failure) {
        log.error("Carrot2 clustering failed", failure);
        throw new SolrException(ErrorCode.SERVER_ERROR, "Carrot2 clustering failed", failure);
    }
}
Also used : HashMap(java.util.HashMap) Document(org.carrot2.core.Document) SolrDocument(org.apache.solr.common.SolrDocument) SolrException(org.apache.solr.common.SolrException) IOException(java.io.IOException) SolrException(org.apache.solr.common.SolrException)

Example 2 with Document

use of org.carrot2.core.Document in project lucene-solr by apache.

the class CarrotClusteringEngine method getDocuments.

/**
 * Prepares Carrot2 documents for clustering.
 *
 * <p>Each Solr document is turned into a Carrot2 {@code Document} built from
 * the configured title, snippet and URL fields. When summaries are requested
 * and a highlighter is available, the snippet is produced by highlighting the
 * snippet fields; otherwise the concatenated field content is used. The
 * document's Solr id, its language (if a language field is configured) and any
 * custom fields are copied onto the Carrot2 document as well.
 *
 * @param solrDocList the Solr documents to convert
 * @param docIds      maps each Solr document to the id used for highlighting;
 *                    when {@code null}, no summaries are produced
 * @param query       the query used for highlighting
 * @param sreq        the current request; supplies parameters, core and searcher
 * @return one Carrot2 document per Solr document, in iteration order
 * @throws IOException declared by the highlighting call
 */
private List<Document> getDocuments(SolrDocumentList solrDocList, Map<SolrDocument, Integer> docIds, Query query, final SolrQueryRequest sreq) throws IOException {
    SolrHighlighter highlighter = null;
    SolrParams solrParams = sreq.getParams();
    SolrCore core = sreq.getCore();
    String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
    String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
    String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
    String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME, null);
    // Maps Solr field names to Carrot2 custom field names
    Map<String, String> customFields = getCustomFieldsMap(solrParams);
    // Parse language code map string into a map
    Map<String, String> languageCodeMap = new HashMap<>();
    if (StringUtils.isNotBlank(languageField)) {
        for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "").split("[, ]")) {
            // Splitting on single separator characters produces empty tokens
            // when the parameter is unset or entries are separated by ", ".
            // Those are artifacts of the split, not malformed mappings, so
            // they are skipped without a warning.
            if (pair.isEmpty()) {
                continue;
            }
            final String[] split = pair.split(":");
            if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
                languageCodeMap.put(split[0], split[1]);
            } else {
                log.warn("Unsupported format for " + CarrotParams.LANGUAGE_CODE_MAP + ": '" + pair + "'. Skipping this mapping.");
            }
        }
    }
    // Set up highlighting, if summaries were requested
    boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, false);
    SolrQueryRequest req = null;
    String[] snippetFieldAry = null;
    if (produceSummary) {
        highlighter = HighlightComponent.getHighlighter(core);
        if (highlighter != null) {
            Map<String, Object> args = new HashMap<>();
            snippetFieldAry = snippetFieldSpec.split("[, ]");
            args.put(HighlightParams.FIELDS, snippetFieldAry);
            args.put(HighlightParams.HIGHLIGHT, "true");
            // We don't care about actually highlighting the area, only about
            // the extracted fragments, so the pre/post markers are empty.
            args.put(HighlightParams.SIMPLE_PRE, "");
            args.put(HighlightParams.SIMPLE_POST, "");
            // Carrot-specific settings win over the generic highlighter ones.
            args.put(HighlightParams.FRAGSIZE, solrParams.getInt(CarrotParams.SUMMARY_FRAGSIZE, solrParams.getInt(HighlightParams.FRAGSIZE, 100)));
            args.put(HighlightParams.SNIPPETS, solrParams.getInt(CarrotParams.SUMMARY_SNIPPETS, solrParams.getInt(HighlightParams.SNIPPETS, 1)));
            // The local request delegates to the searcher of the original request.
            req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {

                @Override
                public SolrIndexSearcher getSearcher() {
                    return sreq.getSearcher();
                }
            };
        } else {
            log.warn("No highlighter configured, cannot produce summary");
            produceSummary = false;
        }
    }
    List<Document> result = new ArrayList<>(solrDocList.size());
    // Single-element holders reused for every document passed to the highlighter.
    float[] scores = { 1.0f };
    int[] docsHolder = new int[1];
    for (SolrDocument sdoc : solrDocList) {
        String snippet = null;
        // See comment in ClusteringComponent#finishStage().
        if (produceSummary && docIds != null) {
            docsHolder[0] = docIds.get(sdoc).intValue();
            DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
            NamedList<Object> highlights = highlighter.doHighlighting(docAsList, query, req, snippetFieldAry);
            if (highlights != null && highlights.size() == 1) {
                // should only be one value given our setup
                // should only be one document
                @SuppressWarnings("unchecked") NamedList<String[]> tmp = (NamedList<String[]>) highlights.getVal(0);
                final StringBuilder sb = new StringBuilder();
                for (String snippetField : snippetFieldAry) {
                    // Join fragments with a period, so that Carrot2 does not create
                    // cross-fragment phrases, such phrases rarely make sense.
                    String[] highlt = tmp.get(snippetField);
                    if (highlt != null) {
                        for (String fragment : highlt) {
                            sb.append(fragment);
                            sb.append(" . ");
                        }
                    }
                }
                snippet = sb.toString();
            }
        }
        // If summaries not enabled or summary generation failed, use full content.
        if (snippet == null) {
            snippet = getConcatenated(sdoc, snippetFieldSpec);
        }
        // Create a Carrot2 document
        Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec), snippet, ObjectUtils.toString(sdoc.getFieldValue(urlField), ""));
        // Store Solr id of the document, we need it to map document instances
        // found in clusters back to identifiers.
        carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
        // Set language
        if (StringUtils.isNotBlank(languageField)) {
            Collection<Object> languages = sdoc.getFieldValues(languageField);
            if (languages != null) {
                // Use the first Carrot2-supported language
                for (Object l : languages) {
                    String lang = ObjectUtils.toString(l, "");
                    if (languageCodeMap.containsKey(lang)) {
                        lang = languageCodeMap.get(lang);
                    }
                    // Language codes may use dashes to separate language
                    // variants, such as 'zh-cn', but Carrot2 uses underscores.
                    if (lang.indexOf('-') > 0) {
                        lang = lang.replace('-', '_');
                    }
                    // If the language is supported by Carrot2, we'll get a non-null value
                    final LanguageCode carrot2Language = LanguageCode.forISOCode(lang);
                    if (carrot2Language != null) {
                        carrotDocument.setLanguage(carrot2Language);
                        break;
                    }
                }
            }
        }
        // Add custom fields
        if (customFields != null) {
            for (Entry<String, String> entry : customFields.entrySet()) {
                carrotDocument.setField(entry.getValue(), sdoc.getFieldValue(entry.getKey()));
            }
        }
        result.add(carrotDocument);
    }
    return result;
}
Also used : Query(org.apache.lucene.search.Query) HashMap(java.util.HashMap) SolrCore(org.apache.solr.core.SolrCore) ArrayList(java.util.ArrayList) Document(org.carrot2.core.Document) SolrDocument(org.apache.solr.common.SolrDocument) DocSlice(org.apache.solr.search.DocSlice) LanguageCode(org.carrot2.core.LanguageCode) SolrDocument(org.apache.solr.common.SolrDocument) NamedList(org.apache.solr.common.util.NamedList) SolrIndexSearcher(org.apache.solr.search.SolrIndexSearcher) SolrHighlighter(org.apache.solr.highlight.SolrHighlighter) LocalSolrQueryRequest(org.apache.solr.request.LocalSolrQueryRequest) SolrQueryRequest(org.apache.solr.request.SolrQueryRequest) LocalSolrQueryRequest(org.apache.solr.request.LocalSolrQueryRequest) SolrParams(org.apache.solr.common.params.SolrParams) DocList(org.apache.solr.search.DocList)

Example 3 with Document

use of org.carrot2.core.Document in project lucene-solr by apache.

the class EchoClusteringAlgorithm method process.

/**
 * Builds one cluster per input document. Each cluster is labelled with the
 * document's title and summary, the name of its language when one is set,
 * and the string form of every non-null custom field listed (comma-separated)
 * in {@code customFields}; the document itself is the cluster's only member.
 *
 * @throws ProcessingException declared by the overridden method
 */
@Override
public void process() throws ProcessingException {
    clusters = new ArrayList<>();
    for (Document doc : documents) {
        final Cluster echo = new Cluster();
        echo.addPhrases(doc.getTitle(), doc.getSummary());
        if (doc.getLanguage() != null) {
            echo.addPhrases(doc.getLanguage().name());
        }
        // Echo the value of each requested custom field that is present.
        final String[] fieldNames = customFields.split(",");
        for (int i = 0; i < fieldNames.length; i++) {
            final Object fieldValue = doc.getField(fieldNames[i]);
            if (fieldValue == null) {
                continue;
            }
            echo.addPhrases(fieldValue.toString());
        }
        echo.addDocuments(doc);
        clusters.add(echo);
    }
}
Also used : Cluster(org.carrot2.core.Cluster) Document(org.carrot2.core.Document)

Example 4 with Document

use of org.carrot2.core.Document in project lucene-solr by apache.

the class CarrotClusteringEngine method clustersToNamedList.

/**
 * Converts Carrot2 clusters into named lists and appends them to {@code parent}.
 *
 * <p>Every emitted entry contains, in this order: {@code "labels"} (the
 * cluster phrases, capped at {@code maxLabels}), {@code "score"} when the
 * cluster has one, {@code "other-topics"} when the cluster is flagged as such,
 * {@code "docs"} (the value stored under {@code SOLR_DOCUMENT_ID} for each
 * document) and, when subclusters are requested and present,
 * {@code "clusters"} with the recursively converted subclusters.
 *
 * @param outputClusters    the clusters to convert
 * @param parent            the list receiving one entry per cluster
 * @param outputSubClusters whether to emit subclusters; when {@code false},
 *                          {@code getAllDocuments()} is used for the document
 *                          list instead of {@code getDocuments()}
 * @param maxLabels         maximum number of labels emitted per cluster
 */
private void clustersToNamedList(List<Cluster> outputClusters, List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
    for (Cluster source : outputClusters) {
        final NamedList<Object> target = new SimpleOrderedMap<>();
        parent.add(target);

        // Labels, truncated to the requested maximum
        final List<String> phrases = source.getPhrases();
        target.add("labels", phrases.size() > maxLabels ? phrases.subList(0, maxLabels) : phrases);

        // Cluster score, if available
        final Double clusterScore = source.getScore();
        if (clusterScore != null) {
            target.add("score", clusterScore);
        }

        // Other topics marker
        if (source.isOtherTopics()) {
            target.add("other-topics", source.isOtherTopics());
        }

        // Documents: direct members when subclusters are emitted separately,
        // otherwise everything the cluster reports via getAllDocuments()
        final List<Document> members;
        if (outputSubClusters) {
            members = source.getDocuments();
        } else {
            members = source.getAllDocuments();
        }
        final List<Object> memberIds = new ArrayList<>();
        target.add("docs", memberIds);
        for (Document member : members) {
            memberIds.add(member.getField(SOLR_DOCUMENT_ID));
        }

        // Subclusters
        if (!outputSubClusters) {
            continue;
        }
        final List<Cluster> children = source.getSubclusters();
        if (children.isEmpty()) {
            continue;
        }
        final List<NamedList<Object>> convertedChildren = new ArrayList<>();
        target.add("clusters", convertedChildren);
        clustersToNamedList(children, convertedChildren, outputSubClusters, maxLabels);
    }
}
Also used : NamedList(org.apache.solr.common.util.NamedList) ArrayList(java.util.ArrayList) Cluster(org.carrot2.core.Cluster) Document(org.carrot2.core.Document) SolrDocument(org.apache.solr.common.SolrDocument) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap)

Aggregations

Document (org.carrot2.core.Document)4 SolrDocument (org.apache.solr.common.SolrDocument)3 ArrayList (java.util.ArrayList)2 HashMap (java.util.HashMap)2 NamedList (org.apache.solr.common.util.NamedList)2 Cluster (org.carrot2.core.Cluster)2 IOException (java.io.IOException)1 Query (org.apache.lucene.search.Query)1 SolrException (org.apache.solr.common.SolrException)1 SolrParams (org.apache.solr.common.params.SolrParams)1 SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap)1 SolrCore (org.apache.solr.core.SolrCore)1 SolrHighlighter (org.apache.solr.highlight.SolrHighlighter)1 LocalSolrQueryRequest (org.apache.solr.request.LocalSolrQueryRequest)1 SolrQueryRequest (org.apache.solr.request.SolrQueryRequest)1 DocList (org.apache.solr.search.DocList)1 DocSlice (org.apache.solr.search.DocSlice)1 SolrIndexSearcher (org.apache.solr.search.SolrIndexSearcher)1 LanguageCode (org.carrot2.core.LanguageCode)1