Search in sources :

Example 26 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project searchcode-server by boyter.

The class CodeSearcher, method getByCodeId.

/**
 * Fallback lookup used when getByRepoFileName fails (apparently due to a Lucene index bug).
 * The code id is a SHA-1 of the path, so it is expected to be unique for anything the
 * current codebase can deal with.
 *
 * @param codeId SHA-1 based identifier stored in the {@code Values.CODEID} field
 * @return the matching {@link CodeResult}, or {@code null} when no document matches
 *         or any exception occurs (errors are logged, not rethrown)
 */
public CodeResult getByCodeId(String codeId) {
    CodeResult codeResult = null;
    // try-with-resources: the original only closed the reader on the success path,
    // leaking it whenever search/parse/file-read threw.
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(this.INDEXPATH)))) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new CodeAnalyzer();
        QueryParser parser = new QueryParser(CODEFIELD, analyzer);
        // Escape the id so SHA-1 content can never be misread as query syntax.
        Query query = parser.parse(Values.CODEID + ":" + QueryParser.escape(codeId));
        Singleton.getLogger().info("Query to get by " + Values.CODEID + ":" + QueryParser.escape(codeId));
        // The id is unique, so one hit is all we ever need.
        TopDocs results = searcher.search(query, 1);
        ScoreDoc[] hits = results.scoreDocs;
        if (hits.length != 0) {
            Document doc = searcher.doc(hits[0].doc);
            String filepath = doc.get(Values.PATH);
            List<String> code = new ArrayList<>();
            try {
                code = Singleton.getHelpers().readFileLinesGuessEncoding(filepath, Singleton.getHelpers().tryParseInt(Properties.getProperties().getProperty(Values.MAXFILELINEDEPTH, Values.DEFAULTMAXFILELINEDEPTH), Values.DEFAULTMAXFILELINEDEPTH));
            } catch (Exception ex) {
                // Best effort: binary files can't be read as lines; return an empty body.
                Singleton.getLogger().info("Indexed file appears to be binary: " + filepath);
            }
            codeResult = new CodeResult(code, null);
            codeResult.setFilePath(filepath);
            codeResult.setCodePath(doc.get(Values.FILELOCATIONFILENAME));
            codeResult.setFileName(doc.get(Values.FILENAME));
            codeResult.setLanguageName(doc.get(Values.LANGUAGENAME));
            codeResult.setMd5hash(doc.get(Values.MD5HASH));
            codeResult.setCodeLines(doc.get(Values.CODELINES));
            codeResult.setDocumentId(hits[0].doc);
            codeResult.setRepoName(doc.get(Values.REPONAME));
            codeResult.setRepoLocation(doc.get(Values.REPOLOCATION));
            codeResult.setCodeOwner(doc.get(Values.CODEOWNER));
            codeResult.setCodeId(doc.get(Values.CODEID));
        }
    } catch (Exception ex) {
        LOGGER.severe(" caught a " + ex.getClass() + "\n with message: " + ex.getMessage());
    }
    return codeResult;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Query(org.apache.lucene.search.Query) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) IOException(java.io.IOException) ScoreDoc(org.apache.lucene.search.ScoreDoc) TopDocs(org.apache.lucene.search.TopDocs) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) IndexReader(org.apache.lucene.index.IndexReader)

Example 27 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

The class TRECScenarioRunnable, method run.

@SuppressWarnings("deprecation")
@Override
public void run() {
    LOG.info("Running TRECScenarioSearcher Thread for " + thisInterestProfile.topicIndex);
    try {
        // When the thread wakes up at a new day, clear pushed tweets
        if ((scenario.equals("A") && Calendar.getInstance(TimeZone.getTimeZone("UTC")).get(Calendar.DAY_OF_YEAR) != now.get(Calendar.DAY_OF_YEAR)) || (scenario.equals("B")))
            pushedTweets.clear();
        Query titleQuery = new QueryParser(TRECIndexerRunnable.StatusField.TEXT.name, Indexer.ANALYZER).parse(thisInterestProfile.titleQueryString());
        LOG.info("Parsed titleQuery " + titleQuery.getClass() + " looks like " + titleQuery.toString() + " " + titleQuery.getClass());
        reader = DirectoryReader.open(FSDirectory.open(new File(indexPath).toPath()));
        // Pick up any index segments written since the reader was opened.
        IndexReader newReader = DirectoryReader.openIfChanged((DirectoryReader) reader);
        if (newReader != null) {
            reader.close();
            reader = newReader;
        }
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(titleCoordSimilarity);
        // Get the total number of hits
        TotalHitCountCollector totalHitCollector = new TotalHitCountCollector();
        // First search and scoring part: titleCoordSimilarity(q,d) = Nt/T
        searcher.search(titleQuery, totalHitCollector);
        // Create a collector for these hits
        if (totalHitCollector.getTotalHits() > 0) {
            TopScoreDocCollector titleQueryHitCollector = TopScoreDocCollector.create(Math.max(0, totalHitCollector.getTotalHits()));
            searcher.search(titleQuery, titleQueryHitCollector);
            ScoreDoc[] coordHits = titleQueryHitCollector.topDocs().scoreDocs;
            // docId -> first-pass (coordinate) score, used to re-weight the second pass.
            HashMap<Integer, Float> coordHMap = new HashMap<Integer, Float>();
            for (ScoreDoc s : coordHits) {
                coordHMap.put(s.doc, s.score);
            }
            LOG.info("Title coordinate similarity has " + totalHitCollector.getTotalHits() + " hits");
            Query titleExpansionQuery = new QueryParser(TRECIndexerRunnable.StatusField.TEXT.name, Indexer.ANALYZER).parse(thisInterestProfile.titleExpansionQueryString(titleBoostFactor, expansionBoostFactor));
            BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
            bqBuilder.add(titleExpansionQuery, BooleanClause.Occur.MUST);
            Query tweetTimeRangeQuery = LongPoint.newRangeQuery(StatusField.EPOCH.name, (long) (Calendar.getInstance().getTimeInMillis() - interval) / 1000, (long) Calendar.getInstance().getTimeInMillis() / 1000);
            // must satisfy the time window, FILTER clause do not
            // participate in scoring
            bqBuilder.add(tweetTimeRangeQuery, BooleanClause.Occur.FILTER);
            Query q = bqBuilder.build();
            LOG.info("Parsed titleExpansionQuery " + titleExpansionQuery.getClass() + " looks like " + titleExpansionQuery.toString() + " " + titleExpansionQuery.getClass());
            LOG.info("Parsed finalQuery " + q.getClass() + " looks like " + q.toString() + " " + q.getClass());
            searcher.setSimilarity(titleExpansionSimilarity);
            totalHitCollector = new TotalHitCountCollector();
            // Second search and scoring part:
            // titleExpansionSimilarity(q,d)= (We*Ne+Wt*Nt)
            searcher.search(q, totalHitCollector);
            if (totalHitCollector.getTotalHits() > 0) {
                TopScoreDocCollector finalQueryHitCollector = TopScoreDocCollector.create(Math.max(0, totalHitCollector.getTotalHits()));
                searcher.search(q, finalQueryHitCollector);
                ScoreDoc[] hits = finalQueryHitCollector.topDocs().scoreDocs;
                LOG.info("Title expansion similarity has " + totalHitCollector.getTotalHits() + " hits");
                // Re-score (titleExpansionSimilarity multiplied by
                // titleCoordSimilarity)
                // Sort by final score and timestamp (descending order)
                ArrayList<ScoreDocTimestamp> finalHits = new ArrayList<ScoreDocTimestamp>();
                for (int j = 0; j < hits.length; ++j) {
                    int docId = hits[j].doc;
                    // Only documents that also matched the first pass get a combined score.
                    if (coordHMap.containsKey(docId)) {
                        float docScore = hits[j].score;
                        Document fullDocument = searcher.doc(docId);
                        long timestamp = Long.parseLong(fullDocument.get(TRECIndexerRunnable.StatusField.EPOCH.name));
                        finalHits.add(new ScoreDocTimestamp(docId, docScore * coordHMap.get(docId), timestamp, fullDocument));
                    }
                }
                Collections.sort(finalHits, new ScoreDocComparator());
                LOG.info("Hit " + finalHits.size() + " documents");
                if (0 != finalHits.size()) {
                    LOG.info("Quering:" + titleExpansionQuery.toString() + ", Found " + finalHits.size() + " hits");
                }
                ArrayList<String> tweetList = new ArrayList<String>();
                HashMap<String, Float> scoreMap = new HashMap<String, Float>();
                for (int j = 0; j < finalHits.size(); ++j) {
                    int docId = finalHits.get(j).doc;
                    Document d = finalHits.get(j).fullDocument;
                    // Push only novel tweets, under the daily cap, scoring at least 6.
                    if (pushedTweets.size() < dailylimit && !pushedTweets.containsKey(d.get(TRECIndexerRunnable.StatusField.ID.name)) && !isDuplicate(d.get(TRECIndexerRunnable.StatusField.TEXT.name)) && finalHits.get(j).score >= 6) {
                        LOG.info(searcher.explain(titleExpansionQuery, docId).toString());
                        LOG.info("Multiplied by " + coordHMap.get(docId) + " Final score " + finalHits.get(j).score);
                        LOG.info("Raw text " + d.get(TRECIndexerRunnable.StatusField.RAW_TEXT.name) + " " + thisInterestProfile.queryTokenCount);
                        tweetList.add(d.get(TRECIndexerRunnable.StatusField.ID.name));
                        scoreMap.put(d.get(TRECIndexerRunnable.StatusField.ID.name), finalHits.get(j).score);
                        LOG.info("Tweet ID:" + String.valueOf(d.get(TRECIndexerRunnable.StatusField.ID.name)));
                        pushedTweets.put(d.get(TRECIndexerRunnable.StatusField.ID.name), d.get(TRECIndexerRunnable.StatusField.TEXT.name));
                    }
                    if (scenario.equals("A") && (pushedTweets.size() >= dailylimit)) {
                        shutDown = true;
                        break;
                    }
                }
                if (tweetList.size() > 0) {
                    if (scenario.equals("A"))
                        postTweetListScenarioA(tweetList, api);
                    else if (scenario.equals("B"))
                        postTweetListScenarioB(tweetList, api, scoreMap);
                } else {
                    LOG.info("Nothing interesting today, Gonna sleep for regular interval");
                }
            }
        } else {
            LOG.info("For this iteration, no single tweet hit even only the title field");
        }
        if (scenario.equals("A") && !shutDown) {
            now = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
        }
        if (scenario.equals("A") && shutDown) {
            now = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
            Calendar tomorrow = Calendar.getInstance();
            tomorrow.set(Calendar.HOUR, 0);
            tomorrow.set(Calendar.MINUTE, 0);
            tomorrow.set(Calendar.SECOND, 0);
            tomorrow.set(Calendar.AM_PM, Calendar.AM);
            tomorrow.set(Calendar.DAY_OF_YEAR, now.get(Calendar.DAY_OF_YEAR) + 1);
            tomorrow.setTimeZone(TimeZone.getTimeZone("UTC"));
            LOG.info("Reached dailyLimit, sleep for the rest of the day");
            LOG.info(tomorrow.getTimeInMillis() + " " + now.getTimeInMillis());
            // Sleep until one minute past local midnight, then reset for the new day.
            Thread.sleep((long) tomorrow.getTimeInMillis() - now.getTimeInMillis() + 60000);
            now = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
            shutDown = false;
            LOG.info("Woke up at this new day!");
            pushedTweets.clear();
        }
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e1) {
        e1.printStackTrace();
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the executor/caller can observe the
        // interruption (the original swallowed it entirely).
        Thread.currentThread().interrupt();
        e.printStackTrace();
    } finally {
        // Close the reader on every exit path; the original only closed it on the
        // success path and leaked it whenever an exception fired first.
        try {
            if (reader != null) {
                reader.close();
            }
        } catch (IOException closeException) {
            closeException.printStackTrace();
        }
    }
}
Also used : ClientBuilder(javax.ws.rs.client.ClientBuilder) Document(org.apache.lucene.document.Document) LongPoint(org.apache.lucene.document.LongPoint) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) IndexReader(org.apache.lucene.index.IndexReader) ParseException(org.apache.lucene.queryparser.classic.ParseException)

Example 28 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

The class TweetSearcherAPI, method search.

@POST
@Path("search")
@Produces(MediaType.APPLICATION_JSON)
/**
 * Searches the live tweet index for the supplied query text.
 *
 * @param query wrapper carrying the raw query string and the result count
 * @return up to {@code query.getCount()} matching tweet ids; an empty list on any failure
 */
public List<SearchResult> search(SearchAPIQuery query) {
    try {
        Query q = new QueryParser(TweetStreamIndexer.StatusField.TEXT.name, TweetSearcher.ANALYZER).parse(query.getQuery());
        // Let a failed open propagate to the outer catch. The original swallowed the
        // IOException here and carried on with a null/stale reader, only to NPE below.
        reader = DirectoryReader.open(TweetSearcher.indexWriter, true, true);
        // Refresh the reader if the writer has committed new segments since it opened.
        IndexReader newReader = DirectoryReader.openIfChanged((DirectoryReader) reader, TweetSearcher.indexWriter, true);
        if (newReader != null) {
            reader.close();
            reader = newReader;
        }
        IndexSearcher searcher = new IndexSearcher(reader);
        int topN = query.getCount();
        TopScoreDocCollector collector = TopScoreDocCollector.create(topN);
        searcher.search(q, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        List<SearchResult> resultHits = new ArrayList<>();
        for (int i = 0; i < hits.length && i < topN; ++i) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            resultHits.add(new SearchResult(String.valueOf(d.get(TweetStreamIndexer.StatusField.ID.name))));
        }
        return resultHits;
    } catch (Exception e) {
        // API boundary: log and degrade to an empty result set rather than erroring out.
        e.printStackTrace();
        return new ArrayList<>();
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Query(org.apache.lucene.search.Query) TopScoreDocCollector(org.apache.lucene.search.TopScoreDocCollector) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Document(org.apache.lucene.document.Document) IOException(java.io.IOException) ScoreDoc(org.apache.lucene.search.ScoreDoc) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) IndexReader(org.apache.lucene.index.IndexReader) Path(javax.ws.rs.Path) POST(javax.ws.rs.POST) Produces(javax.ws.rs.Produces)

Example 29 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

The class TweetServlet, method doGet.

@Override
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    // Only /search is served; everything else is a 404.
    if (request.getRequestURI().equals("/search")) {
        response.setStatus(HttpServletResponse.SC_OK);
        response.setContentType("text/html");
        request.setCharacterEncoding("UTF-8");
        Query q;
        try {
            q = new QueryParser(StatusField.TEXT.name, TweetSearcher.ANALYZER).parse(request.getParameter("query"));
            // Open against the live writer so near-real-time segments are visible.
            try {
                reader = DirectoryReader.open(TweetSearcher.indexWriter, true, true);
            } catch (IOException e) {
                e.printStackTrace();
            }
            IndexReader newReader = DirectoryReader.openIfChanged((DirectoryReader) reader, TweetSearcher.indexWriter, true);
            if (newReader != null) {
                reader.close();
                reader = newReader;
            }
            IndexSearcher searcher = new IndexSearcher(reader);
            int topN;
            if (request.getParameter("top") != null) {
                topN = Integer.parseInt(request.getParameter("top"));
            } else {
                // TODO configurable, default(parameter unspecified in url) topN = 20
                topN = 20;
            }
            TopScoreDocCollector collector = TopScoreDocCollector.create(topN);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            TweetHits tweetHits = new TweetHits(request.getParameter("query"), hits.length);
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                tweetHits.addHit(i, String.valueOf(d.get(StatusField.ID.name)));
            }
            // Render the hit list through the Mustache template.
            MustacheFactory mf = new DefaultMustacheFactory();
            Mustache mustache = mf.compile(MustacheTemplatePath);
            mustache.execute(response.getWriter(), tweetHits).flush();
        } catch (ParseException e) {
            // A malformed query is a client error; the original left the already-set
            // 200 status with an empty body, which looked like a silent success.
            response.setStatus(HttpServletResponse.SC_BAD_REQUEST);
            e.printStackTrace();
        }
    } else {
        response.setStatus(HttpServletResponse.SC_NOT_FOUND);
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Query(org.apache.lucene.search.Query) TopScoreDocCollector(org.apache.lucene.search.TopScoreDocCollector) DefaultMustacheFactory(com.github.mustachejava.DefaultMustacheFactory) Mustache(com.github.mustachejava.Mustache) IOException(java.io.IOException) Document(org.apache.lucene.document.Document) ScoreDoc(org.apache.lucene.search.ScoreDoc) DefaultMustacheFactory(com.github.mustachejava.DefaultMustacheFactory) MustacheFactory(com.github.mustachejava.MustacheFactory) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) IndexReader(org.apache.lucene.index.IndexReader) ParseException(org.apache.lucene.queryparser.classic.ParseException)

Example 30 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

The class IdfPassageScorer, method getTermIdfJSON.

@Override
public JSONObject getTermIdfJSON(List<String> sentList) {
    // Empty stop-word set: every token should survive analysis and get an IDF score.
    final EnglishAnalyzer analyzer = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    final QueryParser parser = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, analyzer);
    final ClassicSimilarity classicSimilarity = new ClassicSimilarity();
    for (String sentence : sentList) {
        // Whitespace tokenization; the parser then applies the English analysis chain.
        for (String token : sentence.trim().split("\\s+")) {
            try {
                Term analyzedTerm = ((TermQuery) parser.parse(token)).getTerm();
                double idf = classicSimilarity.idf(reader.docFreq(analyzedTerm), reader.numDocs());
                termIdfMap.put(token, String.valueOf(idf));
            } catch (Exception e) {
                // Best effort: tokens that fail to parse (or don't reduce to a single
                // TermQuery) are simply skipped.
            }
        }
    }
    return new JSONObject(termIdfMap);
}
Also used : ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) TermQuery(org.apache.lucene.search.TermQuery) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) JSONObject(org.json.JSONObject) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) Term(org.apache.lucene.index.Term)

Aggregations

QueryParser (org.apache.lucene.queryparser.classic.QueryParser)67 Query (org.apache.lucene.search.Query)46 IndexSearcher (org.apache.lucene.search.IndexSearcher)30 Document (org.apache.lucene.document.Document)25 IOException (java.io.IOException)19 Analyzer (org.apache.lucene.analysis.Analyzer)19 IndexReader (org.apache.lucene.index.IndexReader)18 TopDocs (org.apache.lucene.search.TopDocs)18 ScoreDoc (org.apache.lucene.search.ScoreDoc)17 ArrayList (java.util.ArrayList)14 BooleanQuery (org.apache.lucene.search.BooleanQuery)14 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)13 ParseException (org.apache.lucene.queryparser.classic.ParseException)12 TermQuery (org.apache.lucene.search.TermQuery)11 Term (org.apache.lucene.index.Term)6 MatchAllDocsQuery (org.apache.lucene.search.MatchAllDocsQuery)6 WildcardQuery (org.apache.lucene.search.WildcardQuery)6 EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer)5 IndexWriter (org.apache.lucene.index.IndexWriter)5 ScoredDocuments (io.anserini.rerank.ScoredDocuments)4