Search in sources :

Example 11 with TopScoreDocCollector

Use of org.apache.lucene.search.TopScoreDocCollector in the Apache Derby project.

From the class LuceneQueryVTI, method initScan.

// ///////////////////////////////////////////////////////////////////
// 
// MINIONS
// 
// ///////////////////////////////////////////////////////////////////
/**
 * Initialize the metadata and scan
 */
/**
 * Initialize the metadata and scan.
 *
 * <p>Reads the VTI execution context to locate the indexed schema/table/column,
 * derives the result-set column names, verifies SELECT privileges, opens the
 * Lucene index directory, parses the query text, and runs the search.</p>
 *
 * @throws SQLException wrapping any IO, query-parse, or privilege failure
 */
private void initScan() throws SQLException {
    try {
        // read the execution context for this AwareVTI
        VTIContext context = getContext();
        _schema = context.vtiSchema();
        String[] nameParts = LuceneSupport.decodeFunctionName(context.vtiTable());
        _table = nameParts[LuceneSupport.TABLE_PART];
        _column = nameParts[LuceneSupport.COLUMN_PART];
        // divine the column names
        VTITemplate.ColumnDescriptor[] returnColumns = getReturnTableSignature(_connection);
        String[] columnNames = new String[returnColumns.length];
        for (int i = 0; i < returnColumns.length; i++) {
            columnNames[i] = returnColumns[i].columnName;
        }
        setColumnNames(columnNames);
        // the last two columns are score and docID; the key columns precede them
        _scoreColumnID = getColumnCount();
        _docIDColumnID = _scoreColumnID - 1;
        _maxKeyID = _docIDColumnID - 1;
        _minKeyID = 1;
        // make sure the user has SELECT privilege on all relevant columns of the underlying table
        vetPrivileges();
        String delimitedColumnName = LuceneSupport.delimitID(_column);
        DerbyLuceneDir derbyLuceneDir = LuceneSupport.getDerbyLuceneDir(_connection, _schema, _table, delimitedColumnName);
        StorageFile propertiesFile = LuceneSupport.getIndexPropertiesFile(derbyLuceneDir);
        Properties indexProperties = readIndexProperties(propertiesFile);
        String indexDescriptorMaker = indexProperties.getProperty(LuceneSupport.INDEX_DESCRIPTOR_MAKER);
        LuceneIndexDescriptor indexDescriptor = getIndexDescriptor(indexDescriptorMaker);
        // NOTE(review): analyzer is never used below — kept in case getAnalyzer()
        // has side effects; confirm and remove if it is a pure getter
        Analyzer analyzer = indexDescriptor.getAnalyzer();
        QueryParser qp = indexDescriptor.getQueryParser();
        vetLuceneVersion(indexProperties.getProperty(LuceneSupport.LUCENE_VERSION));
        _indexReader = getIndexReader(derbyLuceneDir);
        _searcher = new IndexSearcher(_indexReader);
        Query luceneQuery = qp.parse(_queryText);
        // build the collector once: with a score ceiling when one was supplied,
        // otherwise unbounded. (The original created an unbounded collector and
        // then discarded it whenever a ceiling was present.)
        TopScoreDocCollector tsdc;
        if (_scoreCeiling != null) {
            tsdc = TopScoreDocCollector.create(_windowSize, new ScoreDoc(0, _scoreCeiling), true);
        } else {
            tsdc = TopScoreDocCollector.create(_windowSize, true);
        }
        searchAndScore(luceneQuery, tsdc);
    } catch (IOException ioe) {
        throw ToolUtilities.wrap(ioe);
    } catch (ParseException pe) {
        throw ToolUtilities.wrap(pe);
    } catch (PrivilegedActionException pae) {
        throw ToolUtilities.wrap(pae);
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) VTIContext(org.apache.derby.vti.VTIContext) Query(org.apache.lucene.search.Query) TopScoreDocCollector(org.apache.lucene.search.TopScoreDocCollector) PrivilegedActionException(java.security.PrivilegedActionException) IOException(java.io.IOException) Properties(java.util.Properties) Analyzer(org.apache.lucene.analysis.Analyzer) ScoreDoc(org.apache.lucene.search.ScoreDoc) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) LuceneIndexDescriptor(org.apache.derby.optional.api.LuceneIndexDescriptor) StorageFile(org.apache.derby.io.StorageFile) ParseException(org.apache.lucene.queryparser.classic.ParseException)

Example 12 with TopScoreDocCollector

Use of org.apache.lucene.search.TopScoreDocCollector in the Anserini project by castorini.

From the class ApproximateNearestNeighborSearch, method main.

/**
 * Command-line entry point: finds the approximate nearest neighbors of the
 * word given via the args and prints each neighbor with its score.
 * Prints usage and returns early on bad arguments or an unknown encoding.
 */
public static void main(String[] args) throws Exception {
    ApproximateNearestNeighborSearch.Args indexArgs = new ApproximateNearestNeighborSearch.Args();
    CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: " + ApproximateNearestNeighborSearch.class.getSimpleName() + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    // choose the analyzer matching the encoding that was used at index time
    Analyzer vectorAnalyzer;
    if (indexArgs.encoding.equalsIgnoreCase(FW)) {
        vectorAnalyzer = new FakeWordsEncoderAnalyzer(indexArgs.q);
    } else if (indexArgs.encoding.equalsIgnoreCase(LEXLSH)) {
        vectorAnalyzer = new LexicalLshAnalyzer(indexArgs.decimals, indexArgs.ngrams, indexArgs.hashCount, indexArgs.bucketCount, indexArgs.hashSetSize);
    } else {
        // unknown encoding: show usage and bail out
        parser.printUsage(System.err);
        System.err.println("Example: " + ApproximateNearestNeighborSearch.class.getSimpleName() + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    if (!indexArgs.stored && indexArgs.input == null) {
        System.err.println("Either -path or -stored args must be set");
        return;
    }
    Path indexDir = indexArgs.path;
    if (!Files.exists(indexDir)) {
        Files.createDirectories(indexDir);
    }
    System.out.println(String.format("Reading index at %s", indexArgs.path));
    Directory d = FSDirectory.open(indexDir);
    DirectoryReader reader = DirectoryReader.open(d);
    IndexSearcher searcher = new IndexSearcher(reader);
    if (indexArgs.encoding.equalsIgnoreCase(FW)) {
        // fake-words encoding relies on classic TF-IDF style scoring
        searcher.setSimilarity(new ClassicSimilarity());
    }
    // collect the string-encoded vectors for the query word, either from the
    // stored field in the index (-stored) or from the raw GloVe model file
    Collection<String> vectorStrings = new LinkedList<>();
    if (indexArgs.stored) {
        TopDocs topDocs = searcher.search(new TermQuery(new Term(IndexVectors.FIELD_ID, indexArgs.word)), indexArgs.depth);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            vectorStrings.add(reader.document(scoreDoc.doc).get(IndexVectors.FIELD_VECTOR));
        }
    } else {
        System.out.println(String.format("Loading model %s", indexArgs.input));
        Map<String, List<float[]>> wordVectors = IndexVectors.readGloVe(indexArgs.input);
        if (wordVectors.containsKey(indexArgs.word)) {
            List<float[]> vectors = wordVectors.get(indexArgs.word);
            for (float[] vector : vectors) {
                // render the vector as a space-separated string of components
                StringBuilder sb = new StringBuilder();
                // NOTE(review): fv is declared double, so each float component is
                // widened before being appended — this only matches the stored
                // vectors if index-time encoding did the same; confirm against
                // IndexVectors
                for (double fv : vector) {
                    if (sb.length() > 0) {
                        sb.append(' ');
                    }
                    sb.append(fv);
                }
                String vectorString = sb.toString();
                vectorStrings.add(vectorString);
            }
        }
    }
    // one search per encoded vector of the query word
    for (String vectorString : vectorStrings) {
        float msm = indexArgs.msm;
        float cutoff = indexArgs.cutoff;
        // similarity query: documents sharing encoded-vector tokens score higher
        CommonTermsQuery simQuery = new CommonTermsQuery(SHOULD, SHOULD, cutoff);
        for (String token : AnalyzerUtils.analyze(vectorAnalyzer, vectorString)) {
            simQuery.add(new Term(IndexVectors.FIELD_VECTOR, token));
        }
        if (msm > 0) {
            // require a minimum fraction of matching tokens (both frequency bands)
            simQuery.setHighFreqMinimumNumberShouldMatch(msm);
            simQuery.setLowFreqMinimumNumberShouldMatch(msm);
        }
        long start = System.currentTimeMillis();
        TopScoreDocCollector results = TopScoreDocCollector.create(indexArgs.depth, Integer.MAX_VALUE);
        searcher.search(simQuery, results);
        long time = System.currentTimeMillis() - start;
        System.out.println(String.format("%d nearest neighbors of '%s':", indexArgs.depth, indexArgs.word));
        int rank = 1;
        for (ScoreDoc sd : results.topDocs().scoreDocs) {
            Document document = reader.document(sd.doc);
            String word = document.get(IndexVectors.FIELD_ID);
            System.out.println(String.format("%d. %s (%.3f)", rank, word, sd.score));
            rank++;
        }
        System.out.println(String.format("Search time: %dms", time));
    }
    reader.close();
    d.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) LexicalLshAnalyzer(io.anserini.ann.lexlsh.LexicalLshAnalyzer) FakeWordsEncoderAnalyzer(io.anserini.ann.fw.FakeWordsEncoderAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) ScoreDoc(org.apache.lucene.search.ScoreDoc) CommonTermsQuery(org.apache.lucene.queries.CommonTermsQuery) TopDocs(org.apache.lucene.search.TopDocs) LinkedList(java.util.LinkedList) List(java.util.List) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Path(java.nio.file.Path) TermQuery(org.apache.lucene.search.TermQuery) CmdLineParser(org.kohsuke.args4j.CmdLineParser) DirectoryReader(org.apache.lucene.index.DirectoryReader) TopScoreDocCollector(org.apache.lucene.search.TopScoreDocCollector) FakeWordsEncoderAnalyzer(io.anserini.ann.fw.FakeWordsEncoderAnalyzer) LexicalLshAnalyzer(io.anserini.ann.lexlsh.LexicalLshAnalyzer) Term(org.apache.lucene.index.Term) LinkedList(java.util.LinkedList) CmdLineException(org.kohsuke.args4j.CmdLineException)

Example 13 with TopScoreDocCollector

Use of org.apache.lucene.search.TopScoreDocCollector in the Anserini project by castorini.

From the class ApproximateNearestNeighborEval, method main.

/**
 * Command-line entry point: evaluates approximate-nearest-neighbor recall at
 * a given retrieval depth against brute-force neighbors, over words drawn
 * from TREC topic titles. Prints average recall and average query time.
 */
public static void main(String[] args) throws Exception {
    ApproximateNearestNeighborEval.Args indexArgs = new ApproximateNearestNeighborEval.Args();
    CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: " + ApproximateNearestNeighborEval.class.getSimpleName() + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    // choose the analyzer matching the encoding that was used at index time
    Analyzer vectorAnalyzer;
    if (indexArgs.encoding.equalsIgnoreCase(FW)) {
        vectorAnalyzer = new FakeWordsEncoderAnalyzer(indexArgs.q);
    } else if (indexArgs.encoding.equalsIgnoreCase(LEXLSH)) {
        vectorAnalyzer = new LexicalLshAnalyzer(indexArgs.decimals, indexArgs.ngrams, indexArgs.hashCount, indexArgs.bucketCount, indexArgs.hashSetSize);
    } else {
        // unknown encoding: show usage and bail out
        parser.printUsage(System.err);
        System.err.println("Example: " + ApproximateNearestNeighborEval.class.getSimpleName() + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    System.out.println(String.format("Loading model %s", indexArgs.input));
    Map<String, List<float[]>> wordVectors = IndexVectors.readGloVe(indexArgs.input);
    Path indexDir = indexArgs.path;
    if (!Files.exists(indexDir)) {
        Files.createDirectories(indexDir);
    }
    System.out.println(String.format("Reading index at %s", indexArgs.path));
    Directory d = FSDirectory.open(indexDir);
    DirectoryReader reader = DirectoryReader.open(d);
    IndexSearcher searcher = new IndexSearcher(reader);
    if (indexArgs.encoding.equalsIgnoreCase(FW)) {
        // fake-words encoding relies on classic TF-IDF style scoring
        searcher.setSimilarity(new ClassicSimilarity());
    }
    StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
    // running totals over all evaluated queries
    double recall = 0;
    double time = 0d;
    System.out.println("Evaluating at retrieval depth: " + indexArgs.depth);
    // evaluation vocabulary: tokens analyzed out of the TREC topic titles
    TrecTopicReader trecTopicReader = new TrecTopicReader(indexArgs.topicsPath);
    Collection<String> words = new LinkedList<>();
    trecTopicReader.read().values().forEach(e -> words.addAll(AnalyzerUtils.analyze(standardAnalyzer, e.get("title"))));
    int queryCount = 0;
    for (String word : words) {
        if (wordVectors.containsKey(word)) {
            // ground truth: exact topN nearest vectors by brute force
            Set<String> truth = nearestVector(wordVectors, word, indexArgs.topN);
            try {
                List<float[]> vectors = wordVectors.get(word);
                for (float[] vector : vectors) {
                    // render the vector as a space-separated string of components
                    StringBuilder sb = new StringBuilder();
                    // NOTE(review): fv is declared double, so each float component
                    // is widened before being appended — assumes index-time
                    // encoding did the same; confirm against IndexVectors
                    for (double fv : vector) {
                        if (sb.length() > 0) {
                            sb.append(' ');
                        }
                        sb.append(fv);
                    }
                    String fvString = sb.toString();
                    // similarity query over encoded-vector tokens
                    CommonTermsQuery simQuery = new CommonTermsQuery(SHOULD, SHOULD, indexArgs.cutoff);
                    if (indexArgs.msm > 0) {
                        simQuery.setLowFreqMinimumNumberShouldMatch(indexArgs.msm);
                    }
                    for (String token : AnalyzerUtils.analyze(vectorAnalyzer, fvString)) {
                        simQuery.add(new Term(IndexVectors.FIELD_VECTOR, token));
                    }
                    // time only the search itself, not query construction
                    long start = System.currentTimeMillis();
                    TopScoreDocCollector results = TopScoreDocCollector.create(indexArgs.depth, Integer.MAX_VALUE);
                    searcher.search(simQuery, results);
                    time += System.currentTimeMillis() - start;
                    // observed neighbors returned by the approximate search
                    Set<String> observations = new HashSet<>();
                    for (ScoreDoc sd : results.topDocs().scoreDocs) {
                        Document document = reader.document(sd.doc);
                        String wordValue = document.get(IndexVectors.FIELD_ID);
                        observations.add(wordValue);
                    }
                    // recall for this query = |truth ∩ observed| / |truth|
                    double intersection = Sets.intersection(truth, observations).size();
                    double localRecall = intersection / (double) truth.size();
                    recall += localRecall;
                    queryCount++;
                }
            } catch (IOException e) {
                // best-effort: a failed query is reported and skipped
                System.err.println("search for '" + word + "' failed " + e.getLocalizedMessage());
            }
        }
        // stop once enough sample queries have been evaluated
        if (queryCount >= indexArgs.samples) {
            break;
        }
    }
    recall /= queryCount;
    time /= queryCount;
    System.out.println(String.format("R@%d: %.4f", indexArgs.depth, recall));
    System.out.println(String.format("avg query time: %s ms", time));
    reader.close();
    d.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) LexicalLshAnalyzer(io.anserini.ann.lexlsh.LexicalLshAnalyzer) FakeWordsEncoderAnalyzer(io.anserini.ann.fw.FakeWordsEncoderAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Document(org.apache.lucene.document.Document) CommonTermsQuery(org.apache.lucene.queries.CommonTermsQuery) ScoreDoc(org.apache.lucene.search.ScoreDoc) LinkedList(java.util.LinkedList) List(java.util.List) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) HashSet(java.util.HashSet) Path(java.nio.file.Path) CmdLineParser(org.kohsuke.args4j.CmdLineParser) DirectoryReader(org.apache.lucene.index.DirectoryReader) TopScoreDocCollector(org.apache.lucene.search.TopScoreDocCollector) FakeWordsEncoderAnalyzer(io.anserini.ann.fw.FakeWordsEncoderAnalyzer) LexicalLshAnalyzer(io.anserini.ann.lexlsh.LexicalLshAnalyzer) Term(org.apache.lucene.index.Term) IOException(java.io.IOException) LinkedList(java.util.LinkedList) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) CmdLineException(org.kohsuke.args4j.CmdLineException) TrecTopicReader(io.anserini.search.topicreader.TrecTopicReader)

Example 14 with TopScoreDocCollector

Use of org.apache.lucene.search.TopScoreDocCollector in the neo4j project.

From the class DocValuesCollector, method getTopDocsByRelevance.

/**
 * Replays the buffered matches into a fresh score-ordered collector and
 * returns the {@code size} highest-scoring documents.
 *
 * @param size maximum number of documents to return (also the total-hits threshold)
 * @return the top documents ordered by relevance score
 * @throws IOException if replaying the collected matches fails
 */
private TopDocs getTopDocsByRelevance(int size) throws IOException {
    final TopScoreDocCollector topScoreCollector = TopScoreDocCollector.create(size, size);
    replayTo(topScoreCollector);
    return topScoreCollector.topDocs();
}
Also used : TopScoreDocCollector(org.apache.lucene.search.TopScoreDocCollector)

Example 15 with TopScoreDocCollector

Use of org.apache.lucene.search.TopScoreDocCollector in the gitblit project.

From the class LuceneService, method search.

/**
 * Searches the specified repositories for the given text or query.
 *
 * @param text
 *            if the text is null or empty, null is returned
 * @param page
 *            the page number to retrieve. page is 1-indexed.
 * @param pageSize
 *            the number of elements to return for this page
 * @param repositories
 *            a list of repositories to search. if no repositories are
 *            specified null is returned.
 * @return a list of SearchResults in order from highest to the lowest score
 */
public List<SearchResult> search(String text, int page, int pageSize, String... repositories) {
    // nothing to do without query text or target repositories
    if (StringUtils.isEmpty(text)) {
        return null;
    }
    if (ArrayUtils.isEmpty(repositories)) {
        return null;
    }
    Set<SearchResult> results = new LinkedHashSet<SearchResult>();
    StandardAnalyzer analyzer = new StandardAnalyzer();
    try {
        // default search checks summary and content: OR the two field queries,
        // both allowing leading wildcards
        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
        for (String field : new String[] { FIELD_SUMMARY, FIELD_CONTENT }) {
            QueryParser fieldParser = new QueryParser(field, analyzer);
            fieldParser.setAllowLeadingWildcard(true);
            queryBuilder.add(fieldParser.parse(text), Occur.SHOULD);
        }
        final IndexSearcher searcher;
        if (repositories.length == 1) {
            // single repository search
            searcher = getIndexSearcher(repositories[0]);
        } else {
            // multiple repository search: combine each repository's reader
            List<IndexReader> readers = new ArrayList<IndexReader>();
            for (String repository : repositories) {
                readers.add(getIndexSearcher(repository).getIndexReader());
            }
            IndexReader[] readerArray = readers.toArray(new IndexReader[readers.size()]);
            searcher = new IndexSearcher(new MultiSourceReader(readerArray));
        }
        BooleanQuery query = queryBuilder.build();
        Query rewrittenQuery = searcher.rewrite(query);
        logger.debug(rewrittenQuery.toString());
        TopScoreDocCollector collector = TopScoreDocCollector.create(5000);
        searcher.search(rewrittenQuery, collector);
        // page is 1-indexed; clamp the offset at zero
        int offset = Math.max(0, (page - 1) * pageSize);
        ScoreDoc[] hits = collector.topDocs(offset, pageSize).scoreDocs;
        int totalHits = collector.getTotalHits();
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);
            SearchResult result = createSearchResult(doc, hits[i].score, offset + i + 1, totalHits);
            if (repositories.length == 1) {
                // single repository search
                result.repository = repositories[0];
            } else {
                // multi-repository search: map the doc id back to its source repository
                MultiSourceReader multiReader = (MultiSourceReader) searcher.getIndexReader();
                result.repository = repositories[multiReader.getSourceIndex(docId)];
            }
            String content = doc.get(FIELD_CONTENT);
            result.fragment = getHighlightedFragment(analyzer, query, content, result);
            results.add(result);
        }
    } catch (Exception e) {
        // best-effort: log the failure and fall through to return what we have
        logger.error(MessageFormat.format("Exception while searching for {0}", text), e);
    }
    return new ArrayList<SearchResult>(results);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) IndexSearcher(org.apache.lucene.search.IndexSearcher) BooleanQuery(org.apache.lucene.search.BooleanQuery) Query(org.apache.lucene.search.Query) BooleanQuery(org.apache.lucene.search.BooleanQuery) TopScoreDocCollector(org.apache.lucene.search.TopScoreDocCollector) ArrayList(java.util.ArrayList) SearchResult(com.gitblit.models.SearchResult) Document(org.apache.lucene.document.Document) ParseException(java.text.ParseException) InvalidTokenOffsetsException(org.apache.lucene.search.highlight.InvalidTokenOffsetsException) IOException(java.io.IOException) ScoreDoc(org.apache.lucene.search.ScoreDoc) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) IndexReader(org.apache.lucene.index.IndexReader)

Aggregations

TopScoreDocCollector (org.apache.lucene.search.TopScoreDocCollector)15 IndexSearcher (org.apache.lucene.search.IndexSearcher)11 Query (org.apache.lucene.search.Query)9 ScoreDoc (org.apache.lucene.search.ScoreDoc)8 Document (org.apache.lucene.document.Document)7 IOException (java.io.IOException)6 QueryParser (org.apache.lucene.queryparser.classic.QueryParser)6 TopDocs (org.apache.lucene.search.TopDocs)5 Term (org.apache.lucene.index.Term)4 BooleanQuery (org.apache.lucene.search.BooleanQuery)4 ArrayList (java.util.ArrayList)3 Analyzer (org.apache.lucene.analysis.Analyzer)3 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)3 TermQuery (org.apache.lucene.search.TermQuery)3 FakeWordsEncoderAnalyzer (io.anserini.ann.fw.FakeWordsEncoderAnalyzer)2 LexicalLshAnalyzer (io.anserini.ann.lexlsh.LexicalLshAnalyzer)2 Path (java.nio.file.Path)2 ParseException (java.text.ParseException)2 LinkedHashSet (java.util.LinkedHashSet)2 LinkedList (java.util.LinkedList)2