use of org.apache.stanbol.commons.solr.utils.StreamQueryRequest in project stanbol by apache.
the class TopicClassificationEngine method suggestTopics.
/**
 * Suggests topics for the given text by running a MoreLikeThis query against the
 * model entries of the active Solr server and then trimming the ranked result with
 * a score-based cut-off.
 * <p>
 * Up to {@code MAX_SUGGESTIONS * 3} candidates are requested. For every candidate a
 * second query fetches its metadata entry (primary topic URI and broader concepts).
 * When more than one candidate is found, only the leading candidates whose score is
 * strictly above {@code 0.25 * firstScore + 0.75 * meanScore} are kept (the first
 * candidate is always kept), limited to {@code MAX_SUGGESTIONS} entries.
 *
 * @param text
 *            the text to classify; sent to Solr as the stream body of the request
 * @return the retained suggestions, in the order returned by Solr; possibly empty
 * @throws ClassifierException
 *             if the Solr request fails, if the MoreLikeThis handler is not
 *             configured, if a result lacks the concept URI field, or if no
 *             metadata entry exists for a suggested concept
 */
public List<TopicSuggestion> suggestTopics(String text) throws ClassifierException {
    List<TopicSuggestion> suggestedTopics = new ArrayList<TopicSuggestion>(MAX_SUGGESTIONS * 3);
    SolrServer solrServer = getActiveSolrServer();
    SolrQuery query = new SolrQuery();
    query.setRequestHandler("/" + MoreLikeThisParams.MLT);
    query.setFilterQueries(entryTypeField + ":" + MODEL_ENTRY);
    query.set(MoreLikeThisParams.MATCH_INCLUDE, false);
    query.set(MoreLikeThisParams.MIN_DOC_FREQ, 1);
    query.set(MoreLikeThisParams.MIN_TERM_FREQ, 1);
    query.set(MoreLikeThisParams.MAX_QUERY_TERMS, 30);
    query.set(MoreLikeThisParams.MAX_NUM_TOKENS_PARSED, 10000);
    // TODO: find a way to parse the interesting terms and report them
    // for debugging / explanation in dedicated RDF data structure.
    // query.set(MoreLikeThisParams.INTERESTING_TERMS, "details");
    query.set(MoreLikeThisParams.SIMILARITY_FIELDS, similarityField);
    query.set(CommonParams.STREAM_BODY, text);
    // over query the number of suggestions to find a statistical cut based on the curve of the scores of
    // the top suggestion
    query.setRows(MAX_SUGGESTIONS * 3);
    query.setFields(conceptUriField);
    query.setIncludeScore(true);
    try {
        StreamQueryRequest request = new StreamQueryRequest(query);
        QueryResponse response = request.process(solrServer);
        SolrDocumentList results = response.getResults();
        for (SolrDocument result : results) {
            String conceptUri = (String) result.getFirstValue(conceptUriField);
            if (conceptUri == null) {
                throw new ClassifierException(String.format("Solr Core '%s' is missing required field '%s'.", solrCoreId, conceptUriField));
            }
            Float score = (Float) result.getFirstValue("score");
            // fetch metadata
            SolrQuery metadataQuery = new SolrQuery("*:*");
            // use filter queries to leverage the Solr cache explicitly
            metadataQuery.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
            metadataQuery.addFilterQuery(conceptUriField + ":" + ClientUtils.escapeQueryChars(conceptUri));
            metadataQuery.setFields(conceptUriField, broaderField, primaryTopicUriField);
            SolrDocumentList metadataResults = solrServer.query(metadataQuery).getResults();
            // a model entry without a matching metadata entry would otherwise surface as a bare
            // IndexOutOfBoundsException; report it through the declared exception type instead
            if (metadataResults == null || metadataResults.isEmpty()) {
                throw new ClassifierException(String.format("Solr Core '%s' has no metadata entry for concept '%s'.", solrCoreId, conceptUri));
            }
            SolrDocument metadata = metadataResults.get(0);
            String primaryTopicUri = (String) metadata.getFirstValue(primaryTopicUriField);
            suggestedTopics.add(new TopicSuggestion(conceptUri, primaryTopicUri, metadata.getFieldValues(broaderField), score));
        }
    } catch (SolrServerException e) {
        // the cause may be absent: guard it so the original failure is not masked by an NPE
        Throwable cause = e.getCause();
        if (cause != null && "unknown handler: /mlt".equals(cause.getMessage())) {
            String message = String.format("SolrServer with id '%s' for topic engine '%s' lacks" + " configuration for the MoreLikeThisHandler", solrCoreId, engineName);
            throw new ClassifierException(message, e);
        } else {
            throw new ClassifierException(e);
        }
    }
    if (suggestedTopics.size() <= 1) {
        // no need to apply the cutting heuristic
        return suggestedTopics;
    }
    // filter out suggestions that are less than some threshold based on the mean of the top scores
    float mean = 0.0f;
    for (TopicSuggestion suggestion : suggestedTopics) {
        mean += suggestion.score / suggestedTopics.size();
    }
    // NOTE(review): uses the first suggestion as the top score, i.e. presumes Solr
    // returned the candidates ordered by decreasing score - confirm
    float threshold = 0.25f * suggestedTopics.get(0).score + 0.75f * mean;
    List<TopicSuggestion> filteredSuggestions = new ArrayList<TopicSuggestion>();
    for (TopicSuggestion suggestion : suggestedTopics) {
        if (filteredSuggestions.size() >= MAX_SUGGESTIONS) {
            return filteredSuggestions;
        }
        // always keep the first suggestion; stop at the first one at or below the threshold
        if (filteredSuggestions.isEmpty() || suggestion.score > threshold) {
            filteredSuggestions.add(suggestion);
        } else {
            break;
        }
    }
    return filteredSuggestions;
}
use of org.apache.stanbol.commons.solr.utils.StreamQueryRequest in project stanbol by apache.
the class SolrYard method find.
/**
 * Executes the parsed field query against the Solr server and wraps the returned
 * documents in a result list of {@link Representation}s.
 * <p>
 * The query is cloned before use so that the caller's instance is not refined by
 * this method. The Solr request itself runs inside a privileged action.
 *
 * @param parsedQuery
 *            the query to execute; not modified by this method
 * @param select
 *            when {@code SELECT.QUERY}, the representations are created with the
 *            fields selected by the query plus the result score; otherwise
 *            {@code null} is passed as the field selection, meaning all fields
 * @return the result list, holding the cloned query and an iterator that adapts
 *         each Solr document via {@code createRepresentation}
 * @throws YardException
 *             if the Solr request fails, the MoreLikeThis handler is missing from
 *             the Solr configuration, or the server cannot be accessed
 */
private QueryResultList<Representation> find(final FieldQuery parsedQuery, SELECT select) throws YardException {
    // create a clone of the query, because we need to refine it because the
    // query (as executed) needs to be included in the result set
    FieldQuery fieldQuery = parsedQuery.clone();
    log.debug("find {}", fieldQuery);
    long start = System.currentTimeMillis();
    final Set<String> selected;
    if (select == SELECT.QUERY) {
        // if query set the fields to add to the result Representations
        selected = new HashSet<String>(fieldQuery.getSelectedFields());
        // add the score to query results!
        selected.add(RdfResourceEnum.resultScore.getUri());
    } else {
        // otherwise add all fields
        selected = null;
    }
    final SolrQuery query = solrQueryFactoy.parseFieldQuery(fieldQuery, select);
    long queryGeneration = System.currentTimeMillis();
    if (closed) {
        // only a warning: the request is still attempted below
        log.warn("The SolrYard '{}' was already closed!", config.getName());
    }
    QueryResponse response;
    try {
        response = AccessController.doPrivileged(new PrivilegedExceptionAction<QueryResponse>() {
            public QueryResponse run() throws IOException, SolrServerException {
                StreamQueryRequest request = new StreamQueryRequest(query);
                return request.process(server);
            }
        });
    } catch (PrivilegedActionException pae) {
        Exception e = pae.getException();
        if (e instanceof SolrServerException) {
            // the cause may be absent: guard it so the failure is reported as a
            // YardException instead of being masked by an NPE
            Throwable cause = e.getCause();
            if (cause != null && "unknown handler: /mlt".equals(cause.getMessage())) {
                throw new YardException("Solr is missing '<requestHandler name=\"/mlt\"" + " class=\"solr.MoreLikeThisHandler\" startup=\"lazy\" />'" + " in 'solrconfig.xml'", e);
            }
            throw new YardException("Error while performing Query on SolrServer: " + query.getQuery(), e);
        } else if (e instanceof IOException) {
            throw new YardException("Unable to access SolrServer", e);
        } else {
            // run() declares no other checked exceptions, so anything else is unchecked
            throw RuntimeException.class.cast(e);
        }
    }
    if (SolrQueryFactory.MLT_QUERY_TYPE.equals(query.getRequestHandler())) {
        log.debug("{}", response);
    }
    long queryTime = System.currentTimeMillis();
    // return a queryResultList
    QueryResultListImpl<Representation> resultList = new QueryResultListImpl<Representation>(fieldQuery, // by adapting SolrDocuments to Representations
            new AdaptingIterator<SolrDocument, Representation>(response.getResults().iterator(), // inline Adapter Implementation
                    new AdaptingIterator.Adapter<SolrDocument, Representation>() {
                        @Override
                        public Representation adapt(SolrDocument doc, Class<Representation> type) {
                            // use this method for the conversion!
                            return createRepresentation(doc, selected);
                        }
                    }, Representation.class), Representation.class);
    long resultProcessing = System.currentTimeMillis();
    // skip formatting the timing summary when debug logging is disabled
    if (log.isDebugEnabled()) {
        log.debug(String.format(" ... done [queryGeneration=%dms|queryTime=%dms|resultProcessing=%dms|sum=%dms]", (queryGeneration - start), (queryTime - queryGeneration), (resultProcessing - queryTime), (resultProcessing - start)));
    }
    return resultList;
}
use of org.apache.stanbol.commons.solr.utils.StreamQueryRequest in project stanbol by apache.
the class TopicEngineTest method loadSampleTopicsFromTSV.
/**
 * Loads the sample topics from the {@code /topics_abstracts_snippet.tsv} test
 * resource into the classifier Solr server through its CSV update handler.
 * <p>
 * The file content is sent as the stream body of a request to {@code /update/csv}
 * with tab as separator, no header line, the column names
 * {@code topic,popularity,broader,text}, and {@code commit} enabled.
 *
 * @throws IOException
 *             if the test resource cannot be read or closed
 * @throws SolrServerException
 *             if the Solr request fails
 */
protected void loadSampleTopicsFromTSV() throws IOException, SolrServerException {
    assertNotNull(classifierSolrServer);
    String topicSnippetsPath = "/topics_abstracts_snippet.tsv";
    InputStream is = getClass().getResourceAsStream(topicSnippetsPath);
    assertNotNull("Could not find test resource: " + topicSnippetsPath, is);
    String tsvContent;
    try {
        tsvContent = IOUtils.toString(is, "utf-8");
    } finally {
        // the resource stream was previously never closed
        is.close();
    }
    // Build a query for the CSV importer
    SolrQuery query = new SolrQuery();
    query.setQueryType("/update/csv");
    query.set("commit", true);
    query.set("separator", "\t");
    query.set("headers", false);
    query.set("fieldnames", "topic,popularity,broader,text");
    // "text/plain" is the registered MIME type (the previous value was misspelled "text/plan")
    query.set(CommonParams.STREAM_CONTENTTYPE, "text/plain;charset=utf-8");
    query.set(CommonParams.STREAM_BODY, tsvContent);
    // Upload an index
    QueryResponse response = new StreamQueryRequest(query).process(classifierSolrServer);
    assertNotNull(response);
    log.info(String.format("Indexed test topics in %dms", response.getElapsedTime()));
}
Aggregations