Usage of org.apache.lucene.search.TopScoreDocCollector in the Apache Derby project:
class LuceneQueryVTI, method initScan().
// ///////////////////////////////////////////////////////////////////
//
// MINIONS
//
// ///////////////////////////////////////////////////////////////////
/**
 * Initialize the metadata and scan.
 *
 * <p>Reads the VTI execution context to locate the indexed table/column,
 * derives the return-column layout, verifies SELECT privileges, loads the
 * index properties and query parser, then runs the Lucene query into a
 * score-ordered collector window.</p>
 *
 * @throws SQLException wrapping any I/O, query-parse, or privilege failure
 */
private void initScan() throws SQLException {
    try {
        // read the execution context for this AwareVTI
        VTIContext context = getContext();
        _schema = context.vtiSchema();
        String[] nameParts = LuceneSupport.decodeFunctionName(context.vtiTable());
        _table = nameParts[LuceneSupport.TABLE_PART];
        _column = nameParts[LuceneSupport.COLUMN_PART];

        // divine the column names
        VTITemplate.ColumnDescriptor[] returnColumns = getReturnTableSignature(_connection);
        String[] columnNames = new String[returnColumns.length];
        for (int i = 0; i < returnColumns.length; i++) {
            columnNames[i] = returnColumns[i].columnName;
        }
        setColumnNames(columnNames);

        // fixed trailing columns: score is last, docID second-to-last;
        // key columns occupy positions 1 .. _maxKeyID
        _scoreColumnID = getColumnCount();
        _docIDColumnID = _scoreColumnID - 1;
        _maxKeyID = _docIDColumnID - 1;
        _minKeyID = 1;

        // make sure the user has SELECT privilege on all relevant columns of the underlying table
        vetPrivileges();

        String delimitedColumnName = LuceneSupport.delimitID(_column);
        DerbyLuceneDir derbyLuceneDir = LuceneSupport.getDerbyLuceneDir(_connection, _schema, _table, delimitedColumnName);
        StorageFile propertiesFile = LuceneSupport.getIndexPropertiesFile(derbyLuceneDir);
        Properties indexProperties = readIndexProperties(propertiesFile);
        String indexDescriptorMaker = indexProperties.getProperty(LuceneSupport.INDEX_DESCRIPTOR_MAKER);
        LuceneIndexDescriptor indexDescriptor = getIndexDescriptor(indexDescriptorMaker);
        // analyzer retrieved alongside the parser; not referenced below in this method
        Analyzer analyzer = indexDescriptor.getAnalyzer();
        QueryParser qp = indexDescriptor.getQueryParser();
        vetLuceneVersion(indexProperties.getProperty(LuceneSupport.LUCENE_VERSION));

        _indexReader = getIndexReader(derbyLuceneDir);
        _searcher = new IndexSearcher(_indexReader);
        Query luceneQuery = qp.parse(_queryText);

        // Fix: create the collector exactly once. The original always built a
        // no-ceiling collector first and then discarded it whenever a score
        // ceiling was configured.
        TopScoreDocCollector tsdc;
        if (_scoreCeiling != null) {
            // the "after" ScoreDoc caps returned scores at _scoreCeiling
            tsdc = TopScoreDocCollector.create(_windowSize, new ScoreDoc(0, _scoreCeiling), true);
        } else {
            tsdc = TopScoreDocCollector.create(_windowSize, true);
        }
        searchAndScore(luceneQuery, tsdc);
    } catch (IOException ioe) {
        throw ToolUtilities.wrap(ioe);
    } catch (ParseException pe) {
        throw ToolUtilities.wrap(pe);
    } catch (PrivilegedActionException pae) {
        throw ToolUtilities.wrap(pae);
    }
}
Usage of org.apache.lucene.search.TopScoreDocCollector in the Anserini project by castorini:
class ApproximateNearestNeighborSearch, method main().
/**
 * Command-line entry point: finds the approximate nearest neighbors of a
 * query word, sourcing the query vector either from the index itself
 * ({@code -stored}) or from a GloVe model file ({@code -input}), and prints
 * the top-{@code depth} matches with their scores and the search time.
 */
public static void main(String[] args) throws Exception {
    ApproximateNearestNeighborSearch.Args indexArgs = new ApproximateNearestNeighborSearch.Args();
    CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: " + ApproximateNearestNeighborSearch.class.getSimpleName() + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    // choose the analyzer matching the vector encoding used at indexing time
    Analyzer vectorAnalyzer;
    if (indexArgs.encoding.equalsIgnoreCase(FW)) {
        vectorAnalyzer = new FakeWordsEncoderAnalyzer(indexArgs.q);
    } else if (indexArgs.encoding.equalsIgnoreCase(LEXLSH)) {
        vectorAnalyzer = new LexicalLshAnalyzer(indexArgs.decimals, indexArgs.ngrams, indexArgs.hashCount, indexArgs.bucketCount, indexArgs.hashSetSize);
    } else {
        parser.printUsage(System.err);
        System.err.println("Example: " + ApproximateNearestNeighborSearch.class.getSimpleName() + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    if (!indexArgs.stored && indexArgs.input == null) {
        System.err.println("Either -path or -stored args must be set");
        return;
    }
    Path indexDir = indexArgs.path;
    if (!Files.exists(indexDir)) {
        // NOTE(review): creating the directory right before *reading* an index
        // looks suspicious -- an empty directory will fail DirectoryReader.open;
        // confirm intent.
        Files.createDirectories(indexDir);
    }
    System.out.println(String.format("Reading index at %s", indexArgs.path));
    // Fix: try-with-resources guarantees the Directory and reader are closed
    // even if the search throws (original closed them only on the success path;
    // close order -- reader first, then directory -- is preserved).
    try (Directory d = FSDirectory.open(indexDir);
         DirectoryReader reader = DirectoryReader.open(d)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        if (indexArgs.encoding.equalsIgnoreCase(FW)) {
            // fake-words encoding relies on TF-based classic scoring
            searcher.setSimilarity(new ClassicSimilarity());
        }
        Collection<String> vectorStrings = new LinkedList<>();
        if (indexArgs.stored) {
            // the query vector is stored in the index itself; look it up by id
            TopDocs topDocs = searcher.search(new TermQuery(new Term(IndexVectors.FIELD_ID, indexArgs.word)), indexArgs.depth);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                vectorStrings.add(reader.document(scoreDoc.doc).get(IndexVectors.FIELD_VECTOR));
            }
        } else {
            System.out.println(String.format("Loading model %s", indexArgs.input));
            Map<String, List<float[]>> wordVectors = IndexVectors.readGloVe(indexArgs.input);
            if (wordVectors.containsKey(indexArgs.word)) {
                List<float[]> vectors = wordVectors.get(indexArgs.word);
                for (float[] vector : vectors) {
                    // serialize the vector as a space-separated value string
                    StringBuilder sb = new StringBuilder();
                    for (double fv : vector) {
                        if (sb.length() > 0) {
                            sb.append(' ');
                        }
                        sb.append(fv);
                    }
                    String vectorString = sb.toString();
                    vectorStrings.add(vectorString);
                }
            }
        }
        for (String vectorString : vectorStrings) {
            float msm = indexArgs.msm;
            float cutoff = indexArgs.cutoff;
            // CommonTermsQuery separates high/low-frequency terms at `cutoff`
            CommonTermsQuery simQuery = new CommonTermsQuery(SHOULD, SHOULD, cutoff);
            for (String token : AnalyzerUtils.analyze(vectorAnalyzer, vectorString)) {
                simQuery.add(new Term(IndexVectors.FIELD_VECTOR, token));
            }
            if (msm > 0) {
                simQuery.setHighFreqMinimumNumberShouldMatch(msm);
                simQuery.setLowFreqMinimumNumberShouldMatch(msm);
            }
            long start = System.currentTimeMillis();
            TopScoreDocCollector results = TopScoreDocCollector.create(indexArgs.depth, Integer.MAX_VALUE);
            searcher.search(simQuery, results);
            long time = System.currentTimeMillis() - start;
            System.out.println(String.format("%d nearest neighbors of '%s':", indexArgs.depth, indexArgs.word));
            int rank = 1;
            for (ScoreDoc sd : results.topDocs().scoreDocs) {
                Document document = reader.document(sd.doc);
                String word = document.get(IndexVectors.FIELD_ID);
                System.out.println(String.format("%d. %s (%.3f)", rank, word, sd.score));
                rank++;
            }
            System.out.println(String.format("Search time: %dms", time));
        }
    }
}
Usage of org.apache.lucene.search.TopScoreDocCollector in the Anserini project by castorini:
class ApproximateNearestNeighborEval, method main().
/**
 * Command-line entry point: evaluates approximate-nearest-neighbor recall.
 * For each sampled topic word present in the GloVe model, retrieves the
 * top-{@code depth} indexed vectors and compares them against the exact
 * top-{@code topN} neighbors, reporting average recall and query latency.
 */
public static void main(String[] args) throws Exception {
    ApproximateNearestNeighborEval.Args indexArgs = new ApproximateNearestNeighborEval.Args();
    CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: " + ApproximateNearestNeighborEval.class.getSimpleName() + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    // choose the analyzer matching the vector encoding used at indexing time
    Analyzer vectorAnalyzer;
    if (indexArgs.encoding.equalsIgnoreCase(FW)) {
        vectorAnalyzer = new FakeWordsEncoderAnalyzer(indexArgs.q);
    } else if (indexArgs.encoding.equalsIgnoreCase(LEXLSH)) {
        vectorAnalyzer = new LexicalLshAnalyzer(indexArgs.decimals, indexArgs.ngrams, indexArgs.hashCount, indexArgs.bucketCount, indexArgs.hashSetSize);
    } else {
        parser.printUsage(System.err);
        System.err.println("Example: " + ApproximateNearestNeighborEval.class.getSimpleName() + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    System.out.println(String.format("Loading model %s", indexArgs.input));
    Map<String, List<float[]>> wordVectors = IndexVectors.readGloVe(indexArgs.input);
    Path indexDir = indexArgs.path;
    if (!Files.exists(indexDir)) {
        // NOTE(review): creating the directory right before *reading* an index
        // looks suspicious -- an empty directory will fail DirectoryReader.open;
        // confirm intent.
        Files.createDirectories(indexDir);
    }
    System.out.println(String.format("Reading index at %s", indexArgs.path));
    // Fix: try-with-resources guarantees the reader and directory are closed
    // even when evaluation throws (original closed them only on success).
    try (Directory d = FSDirectory.open(indexDir);
         DirectoryReader reader = DirectoryReader.open(d)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        if (indexArgs.encoding.equalsIgnoreCase(FW)) {
            // fake-words encoding relies on TF-based classic scoring
            searcher.setSimilarity(new ClassicSimilarity());
        }
        StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
        double recall = 0;
        double time = 0d;
        System.out.println("Evaluating at retrieval depth: " + indexArgs.depth);
        TrecTopicReader trecTopicReader = new TrecTopicReader(indexArgs.topicsPath);
        Collection<String> words = new LinkedList<>();
        trecTopicReader.read().values().forEach(e -> words.addAll(AnalyzerUtils.analyze(standardAnalyzer, e.get("title"))));
        int queryCount = 0;
        for (String word : words) {
            if (wordVectors.containsKey(word)) {
                // exact neighbors computed from the model serve as ground truth
                Set<String> truth = nearestVector(wordVectors, word, indexArgs.topN);
                try {
                    List<float[]> vectors = wordVectors.get(word);
                    for (float[] vector : vectors) {
                        // serialize the vector as a space-separated value string
                        StringBuilder sb = new StringBuilder();
                        for (double fv : vector) {
                            if (sb.length() > 0) {
                                sb.append(' ');
                            }
                            sb.append(fv);
                        }
                        String fvString = sb.toString();
                        CommonTermsQuery simQuery = new CommonTermsQuery(SHOULD, SHOULD, indexArgs.cutoff);
                        if (indexArgs.msm > 0) {
                            simQuery.setLowFreqMinimumNumberShouldMatch(indexArgs.msm);
                        }
                        for (String token : AnalyzerUtils.analyze(vectorAnalyzer, fvString)) {
                            simQuery.add(new Term(IndexVectors.FIELD_VECTOR, token));
                        }
                        long start = System.currentTimeMillis();
                        TopScoreDocCollector results = TopScoreDocCollector.create(indexArgs.depth, Integer.MAX_VALUE);
                        searcher.search(simQuery, results);
                        time += System.currentTimeMillis() - start;
                        Set<String> observations = new HashSet<>();
                        for (ScoreDoc sd : results.topDocs().scoreDocs) {
                            Document document = reader.document(sd.doc);
                            String wordValue = document.get(IndexVectors.FIELD_ID);
                            observations.add(wordValue);
                        }
                        double intersection = Sets.intersection(truth, observations).size();
                        double localRecall = intersection / (double) truth.size();
                        recall += localRecall;
                        queryCount++;
                    }
                } catch (IOException e) {
                    System.err.println("search for '" + word + "' failed " + e.getLocalizedMessage());
                }
            }
            if (queryCount >= indexArgs.samples) {
                break;
            }
        }
        // Fix: guard the averaging against queryCount == 0 (no topic word found
        // in the model); the original printed NaN in that case.
        if (queryCount == 0) {
            System.err.println("No topic words found in the model; recall is undefined");
        } else {
            recall /= queryCount;
            time /= queryCount;
            System.out.println(String.format("R@%d: %.4f", indexArgs.depth, recall));
            System.out.println(String.format("avg query time: %s ms", time));
        }
    }
}
Usage of org.apache.lucene.search.TopScoreDocCollector in the neo4j project:
class DocValuesCollector, method getTopDocsByRelevance().
/**
 * Replays the buffered matches into a score-ordered collector and returns
 * the top {@code size} documents by relevance.
 *
 * @param size maximum number of hits to return; also used as the
 *             total-hits counting threshold
 * @throws IOException if replaying the collected documents fails
 */
private TopDocs getTopDocsByRelevance(int size) throws IOException {
    final TopScoreDocCollector scoreCollector = TopScoreDocCollector.create(size, size);
    replayTo(scoreCollector);
    return scoreCollector.topDocs();
}
Usage of org.apache.lucene.search.TopScoreDocCollector in the gitblit project:
class LuceneService, method search().
/**
 * Searches the specified repositories for the given text or query.
 *
 * @param text
 *            if the text is null or empty, null is returned
 * @param page
 *            the page number to retrieve. page is 1-indexed.
 * @param pageSize
 *            the number of elements to return for this page
 * @param repositories
 *            a list of repositories to search. if no repositories are
 *            specified null is returned.
 * @return a list of SearchResults in order from highest to the lowest score
 */
public List<SearchResult> search(String text, int page, int pageSize, String... repositories) {
    if (StringUtils.isEmpty(text)) {
        return null;
    }
    if (ArrayUtils.isEmpty(repositories)) {
        return null;
    }
    Set<SearchResult> results = new LinkedHashSet<SearchResult>();
    // Fix: Lucene's Analyzer implements Closeable; close it deterministically
    // instead of leaking its reusable token-stream components.
    try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
        // default search checks summary and content
        BooleanQuery.Builder bldr = new BooleanQuery.Builder();
        QueryParser qp;
        qp = new QueryParser(FIELD_SUMMARY, analyzer);
        qp.setAllowLeadingWildcard(true);
        bldr.add(qp.parse(text), Occur.SHOULD);
        qp = new QueryParser(FIELD_CONTENT, analyzer);
        qp.setAllowLeadingWildcard(true);
        bldr.add(qp.parse(text), Occur.SHOULD);
        IndexSearcher searcher;
        if (repositories.length == 1) {
            // single repository search
            searcher = getIndexSearcher(repositories[0]);
        } else {
            // multiple repository search
            List<IndexReader> readers = new ArrayList<IndexReader>();
            for (String repository : repositories) {
                IndexSearcher repositoryIndex = getIndexSearcher(repository);
                readers.add(repositoryIndex.getIndexReader());
            }
            IndexReader[] rdrs = readers.toArray(new IndexReader[readers.size()]);
            // NOTE(review): this composite reader is never closed; the
            // underlying readers presumably belong to cached searchers, so
            // closing it here may be unsafe -- confirm ownership before changing.
            MultiSourceReader reader = new MultiSourceReader(rdrs);
            searcher = new IndexSearcher(reader);
        }
        BooleanQuery query = bldr.build();
        Query rewrittenQuery = searcher.rewrite(query);
        logger.debug(rewrittenQuery.toString());
        TopScoreDocCollector collector = TopScoreDocCollector.create(5000);
        searcher.search(rewrittenQuery, collector);
        // page is 1-indexed; compute the zero-based offset into the hits
        int offset = Math.max(0, (page - 1) * pageSize);
        ScoreDoc[] hits = collector.topDocs(offset, pageSize).scoreDocs;
        int totalHits = collector.getTotalHits();
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);
            SearchResult result = createSearchResult(doc, hits[i].score, offset + i + 1, totalHits);
            if (repositories.length == 1) {
                // single repository search
                result.repository = repositories[0];
            } else {
                // multi-repository search: map the global docId back to the
                // source reader to recover which repository it came from
                MultiSourceReader reader = (MultiSourceReader) searcher.getIndexReader();
                int index = reader.getSourceIndex(docId);
                result.repository = repositories[index];
            }
            String content = doc.get(FIELD_CONTENT);
            result.fragment = getHighlightedFragment(analyzer, query, content, result);
            results.add(result);
        }
    } catch (Exception e) {
        logger.error(MessageFormat.format("Exception while searching for {0}", text), e);
    }
    return new ArrayList<SearchResult>(results);
}
Aggregations