Use of org.apache.lucene.queryparser.classic.QueryParser in project searchcode-server by boyter.
The class CodeSearcher, method getByCodeId.
/**
 * Only used as a fallback if getByRepoFileName fails, for what appears to be a
 * Lucene index bug. This should always work, because the path used is a SHA1
 * hash and should be unique for anything the current codebase can deal with.
 */
public CodeResult getByCodeId(String codeId) {
    CodeResult codeResult = null;
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(this.INDEXPATH)));
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new CodeAnalyzer();
        QueryParser parser = new QueryParser(CODEFIELD, analyzer);
        Query query = parser.parse(Values.CODEID + ":" + QueryParser.escape(codeId));
        Singleton.getLogger().info("Query to get by " + Values.CODEID + ":" + QueryParser.escape(codeId));
        TopDocs results = searcher.search(query, 1);
        ScoreDoc[] hits = results.scoreDocs;
        if (hits.length != 0) {
            Document doc = searcher.doc(hits[0].doc);
            String filepath = doc.get(Values.PATH);
            List<String> code = new ArrayList<>();
            try {
                code = Singleton.getHelpers().readFileLinesGuessEncoding(filepath,
                        Singleton.getHelpers().tryParseInt(
                                Properties.getProperties().getProperty(Values.MAXFILELINEDEPTH, Values.DEFAULTMAXFILELINEDEPTH),
                                Values.DEFAULTMAXFILELINEDEPTH));
            } catch (Exception ex) {
                Singleton.getLogger().info("Indexed file appears to be binary: " + filepath);
            }
            codeResult = new CodeResult(code, null);
            codeResult.setFilePath(filepath);
            codeResult.setCodePath(doc.get(Values.FILELOCATIONFILENAME));
            codeResult.setFileName(doc.get(Values.FILENAME));
            codeResult.setLanguageName(doc.get(Values.LANGUAGENAME));
            codeResult.setMd5hash(doc.get(Values.MD5HASH));
            codeResult.setCodeLines(doc.get(Values.CODELINES));
            codeResult.setDocumentId(hits[0].doc);
            codeResult.setRepoName(doc.get(Values.REPONAME));
            codeResult.setRepoLocation(doc.get(Values.REPOLOCATION));
            codeResult.setCodeOwner(doc.get(Values.CODEOWNER));
            codeResult.setCodeId(doc.get(Values.CODEID));
        }
        reader.close();
    } catch (Exception ex) {
        LOGGER.severe(" caught a " + ex.getClass() + "\n with message: " + ex.getMessage());
    }
    return codeResult;
}
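Stripped to its essentials, the method above is an exact lookup by a unique identifier field. A minimal self-contained sketch of the same pattern, assuming a hypothetical index at /tmp/index with "codeid" and "path" fields (all names here are illustrative, not searchcode-server's):

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class LookupById {
    public static void main(String[] args) throws Exception {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")));
        IndexSearcher searcher = new IndexSearcher(reader);
        QueryParser parser = new QueryParser("contents", new StandardAnalyzer());
        // QueryParser.escape neutralises syntax characters such as ':' or '*',
        // so the raw id is matched literally instead of being parsed as syntax
        Query query = parser.parse("codeid:" + QueryParser.escape("abc123"));
        TopDocs results = searcher.search(query, 1); // a unique id matches at most one document
        if (results.scoreDocs.length > 0) {
            Document doc = searcher.doc(results.scoreDocs[0].doc);
            System.out.println(doc.get("path"));
        }
        reader.close();
    }
}

The escape call is what makes this safe as a fallback path: a SHA1-based id never needs query syntax, so treating it as a literal loses nothing.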
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
The class TRECScenarioRunnable, method run.
@SuppressWarnings("deprecation")
@Override
public void run() {
    LOG.info("Running TRECScenarioSearcher Thread for " + thisInterestProfile.topicIndex);
    try {
        // When the thread wakes up on a new day, clear the pushed tweets
        if ((scenario.equals("A") && Calendar.getInstance(TimeZone.getTimeZone("UTC")).get(Calendar.DAY_OF_YEAR) != now.get(Calendar.DAY_OF_YEAR)) || (scenario.equals("B")))
            pushedTweets.clear();
        Query titleQuery = new QueryParser(TRECIndexerRunnable.StatusField.TEXT.name, Indexer.ANALYZER)
                .parse(thisInterestProfile.titleQueryString());
        LOG.info("Parsed titleQuery " + titleQuery.getClass() + " looks like " + titleQuery.toString());
        reader = DirectoryReader.open(FSDirectory.open(new File(indexPath).toPath()));
        IndexReader newReader = DirectoryReader.openIfChanged((DirectoryReader) reader);
        if (newReader != null) {
            reader.close();
            reader = newReader;
        }
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(titleCoordSimilarity);
        // Get the total number of hits
        TotalHitCountCollector totalHitCollector = new TotalHitCountCollector();
        // First search and scoring part: titleCoordSimilarity(q,d) = Nt/T
        searcher.search(titleQuery, totalHitCollector);
        // Create a collector for these hits
        if (totalHitCollector.getTotalHits() > 0) {
            TopScoreDocCollector titleQueryHitCollector = TopScoreDocCollector.create(Math.max(0, totalHitCollector.getTotalHits()));
            searcher.search(titleQuery, titleQueryHitCollector);
            ScoreDoc[] coordHits = titleQueryHitCollector.topDocs().scoreDocs;
            HashMap<Integer, Float> coordHMap = new HashMap<Integer, Float>();
            for (ScoreDoc s : coordHits) {
                coordHMap.put(s.doc, s.score);
            }
            LOG.info("Title coordinate similarity has " + totalHitCollector.getTotalHits() + " hits");
            Query titleExpansionQuery = new QueryParser(TRECIndexerRunnable.StatusField.TEXT.name, Indexer.ANALYZER)
                    .parse(thisInterestProfile.titleExpansionQueryString(titleBoostFactor, expansionBoostFactor));
            BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
            bqBuilder.add(titleExpansionQuery, BooleanClause.Occur.MUST);
            Query tweetTimeRangeQuery = LongPoint.newRangeQuery(StatusField.EPOCH.name,
                    (long) (Calendar.getInstance().getTimeInMillis() - interval) / 1000,
                    (long) Calendar.getInstance().getTimeInMillis() / 1000);
            // Hits must satisfy the time window; FILTER clauses do not participate in scoring
            bqBuilder.add(tweetTimeRangeQuery, BooleanClause.Occur.FILTER);
            Query q = bqBuilder.build();
            LOG.info("Parsed titleExpansionQuery " + titleExpansionQuery.getClass() + " looks like " + titleExpansionQuery.toString());
            LOG.info("Parsed finalQuery " + q.getClass() + " looks like " + q.toString());
            searcher.setSimilarity(titleExpansionSimilarity);
            totalHitCollector = new TotalHitCountCollector();
            // Second search and scoring part: titleExpansionSimilarity(q,d) = (We*Ne + Wt*Nt)
            searcher.search(q, totalHitCollector);
            if (totalHitCollector.getTotalHits() > 0) {
                TopScoreDocCollector finalQueryHitCollector = TopScoreDocCollector.create(Math.max(0, totalHitCollector.getTotalHits()));
                searcher.search(q, finalQueryHitCollector);
                ScoreDoc[] hits = finalQueryHitCollector.topDocs().scoreDocs;
                LOG.info("Title expansion similarity has " + totalHitCollector.getTotalHits() + " hits");
                // Re-score (titleExpansionSimilarity multiplied by titleCoordSimilarity),
                // then sort by final score and timestamp (descending order)
                ArrayList<ScoreDocTimestamp> finalHits = new ArrayList<ScoreDocTimestamp>();
                for (int j = 0; j < hits.length; ++j) {
                    int docId = hits[j].doc;
                    if (coordHMap.containsKey(docId)) {
                        float docScore = hits[j].score;
                        Document fullDocument = searcher.doc(docId);
                        long timestamp = Long.parseLong(fullDocument.get(TRECIndexerRunnable.StatusField.EPOCH.name));
                        finalHits.add(new ScoreDocTimestamp(docId, docScore * coordHMap.get(docId), timestamp, fullDocument));
                    }
                }
                Collections.sort(finalHits, new ScoreDocComparator());
                LOG.info("Hit " + finalHits.size() + " documents");
                if (0 != finalHits.size()) {
                    LOG.info("Querying:" + titleExpansionQuery.toString() + ", found " + finalHits.size() + " hits");
                }
                ArrayList<String> tweetList = new ArrayList<String>();
                HashMap<String, Float> scoreMap = new HashMap<String, Float>();
                for (int j = 0; j < finalHits.size(); ++j) {
                    int docId = finalHits.get(j).doc;
                    Document d = finalHits.get(j).fullDocument;
                    if (pushedTweets.size() < dailylimit
                            && !pushedTweets.containsKey(d.get(TRECIndexerRunnable.StatusField.ID.name))
                            && !isDuplicate(d.get(TRECIndexerRunnable.StatusField.TEXT.name))
                            && finalHits.get(j).score >= 6) {
                        LOG.info(searcher.explain(titleExpansionQuery, docId).toString());
                        LOG.info("Multiplied by " + coordHMap.get(docId) + " final score " + finalHits.get(j).score);
                        LOG.info("Raw text " + d.get(TRECIndexerRunnable.StatusField.RAW_TEXT.name) + " " + thisInterestProfile.queryTokenCount);
                        tweetList.add(d.get(TRECIndexerRunnable.StatusField.ID.name));
                        scoreMap.put(d.get(TRECIndexerRunnable.StatusField.ID.name), finalHits.get(j).score);
                        LOG.info("Tweet ID:" + String.valueOf(d.get(TRECIndexerRunnable.StatusField.ID.name)));
                        pushedTweets.put(d.get(TRECIndexerRunnable.StatusField.ID.name), d.get(TRECIndexerRunnable.StatusField.TEXT.name));
                    }
                    if (scenario.equals("A") && (pushedTweets.size() >= dailylimit)) {
                        shutDown = true;
                        break;
                    }
                }
                if (tweetList.size() > 0) {
                    if (scenario.equals("A"))
                        postTweetListScenarioA(tweetList, api);
                    else if (scenario.equals("B"))
                        postTweetListScenarioB(tweetList, api, scoreMap);
                } else {
                    LOG.info("Nothing interesting today, going to sleep for the regular interval");
                }
            }
        } else {
            LOG.info("For this iteration, not a single tweet hit even the title field alone");
        }
        if (scenario.equals("A") && !shutDown) {
            now = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
        }
        if (scenario.equals("A") && shutDown) {
            now = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
            Calendar tomorrow = Calendar.getInstance();
            tomorrow.set(Calendar.HOUR, 0);
            tomorrow.set(Calendar.MINUTE, 0);
            tomorrow.set(Calendar.SECOND, 0);
            tomorrow.set(Calendar.AM_PM, Calendar.AM);
            tomorrow.set(Calendar.DAY_OF_YEAR, now.get(Calendar.DAY_OF_YEAR) + 1);
            tomorrow.setTimeZone(TimeZone.getTimeZone("UTC"));
            LOG.info("Reached dailyLimit, sleeping for the rest of the day");
            LOG.info(tomorrow.getTimeInMillis() + " " + now.getTimeInMillis());
            Thread.sleep((long) tomorrow.getTimeInMillis() - now.getTimeInMillis() + 60000);
            now = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
            shutDown = false;
            LOG.info("Woke up to a new day!");
            pushedTweets.clear();
        }
        reader.close();
    } catch (IOException e) {
        // the index could not be opened or read; log and let the next run retry
        e.printStackTrace();
    } catch (ParseException e1) {
        // the interest profile produced an unparsable query; log and skip this run
        e1.printStackTrace();
    } catch (InterruptedException e) {
        // the daily sleep was interrupted; log and fall through
        e.printStackTrace();
    }
}
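The Lucene-specific trick in this runnable is pairing a scored MUST clause with an unscored FILTER clause for the recency window. A minimal sketch of just that combination, assuming illustrative field names ("text", "epoch") and a hypothetical 30-minute window in place of the thread's actual interval:

import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class ScoredPlusFilter {
    public static Query recentMatches() {
        long nowSecs = System.currentTimeMillis() / 1000;
        long windowSecs = 30 * 60; // hypothetical window length
        Query textQuery = new TermQuery(new Term("text", "earthquake"));
        Query timeFilter = LongPoint.newRangeQuery("epoch", nowSecs - windowSecs, nowSecs);
        return new BooleanQuery.Builder()
                .add(textQuery, BooleanClause.Occur.MUST)    // contributes to the score
                .add(timeFilter, BooleanClause.Occur.FILTER) // restricts matches, never scores
                .build();
    }
}

A FILTER clause matches like MUST but is skipped during scoring, which is why the code above uses it for the epoch range: recency narrows the candidate set without distorting the similarity score.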
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
The class TweetSearcherAPI, method search.
@POST
@Path("search")
@Produces(MediaType.APPLICATION_JSON)
public List<SearchResult> search(SearchAPIQuery query) {
    try {
        Query q = new QueryParser(TweetStreamIndexer.StatusField.TEXT.name, TweetSearcher.ANALYZER).parse(query.getQuery());
        try {
            reader = DirectoryReader.open(TweetSearcher.indexWriter, true, true);
        } catch (IOException e) {
            // the reader could not be opened from the live writer; log and continue
            e.printStackTrace();
        }
        IndexReader newReader = DirectoryReader.openIfChanged((DirectoryReader) reader, TweetSearcher.indexWriter, true);
        if (newReader != null) {
            reader.close();
            reader = newReader;
        }
        IndexSearcher searcher = new IndexSearcher(reader);
        int topN = query.getCount();
        TopScoreDocCollector collector = TopScoreDocCollector.create(topN);
        searcher.search(q, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        List<SearchResult> resultHits = new ArrayList<>();
        for (int i = 0; i < hits.length && i < topN; ++i) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            resultHits.add(new SearchResult(String.valueOf(d.get(TweetStreamIndexer.StatusField.ID.name))));
        }
        return resultHits;
    } catch (Exception e) {
        // parse or search failure; log and return an empty result list
        e.printStackTrace();
        return new ArrayList<>();
    }
}
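Both this endpoint and the servlet below refresh a near-real-time reader directly from a live IndexWriter. A hedged sketch of that idiom in isolation, using the same applyAllDeletes/writeAllDeletes flags as above (the wrapper class is illustrative):

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;

public class NrtSearch {
    private DirectoryReader reader;

    IndexSearcher refreshedSearcher(IndexWriter writer) throws IOException {
        if (reader == null) {
            reader = DirectoryReader.open(writer, true, true);
        } else {
            // openIfChanged returns null when the index is unchanged,
            // in which case the existing reader can be reused as-is
            DirectoryReader newReader = DirectoryReader.openIfChanged(reader, writer, true);
            if (newReader != null) {
                reader.close();
                reader = newReader;
            }
        }
        return new IndexSearcher(reader);
    }
}

Opening from the writer rather than the directory makes documents searchable as soon as they are indexed, without waiting for a commit.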
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
The class TweetServlet, method doGet.
@Override
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    if (request.getRequestURI().equals("/search")) {
        response.setStatus(HttpServletResponse.SC_OK);
        response.setContentType("text/html");
        request.setCharacterEncoding("UTF-8");
        Query q;
        try {
            q = new QueryParser(StatusField.TEXT.name, TweetSearcher.ANALYZER).parse(request.getParameter("query"));
            try {
                reader = DirectoryReader.open(TweetSearcher.indexWriter, true, true);
            } catch (IOException e) {
                // the reader could not be opened from the live writer; log and continue
                e.printStackTrace();
            }
            IndexReader newReader = DirectoryReader.openIfChanged((DirectoryReader) reader, TweetSearcher.indexWriter, true);
            if (newReader != null) {
                reader.close();
                reader = newReader;
            }
            IndexSearcher searcher = new IndexSearcher(reader);
            int topN;
            if (request.getParameter("top") != null) {
                topN = Integer.parseInt(request.getParameter("top"));
            } else {
                // TODO make configurable; defaults to 20 when the "top" parameter is absent from the URL
                topN = 20;
            }
            TopScoreDocCollector collector = TopScoreDocCollector.create(topN);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            TweetHits tweetHits = new TweetHits(request.getParameter("query"), hits.length);
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                tweetHits.addHit(i, String.valueOf(d.get(StatusField.ID.name)));
            }
            MustacheFactory mf = new DefaultMustacheFactory();
            Mustache mustache = mf.compile(MustacheTemplatePath);
            mustache.execute(response.getWriter(), tweetHits).flush();
        } catch (ParseException e) {
            // the query string could not be parsed; log and return the empty page
            e.printStackTrace();
        }
    } else {
        response.setStatus(HttpServletResponse.SC_NOT_FOUND);
    }
}
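One caveat worth noting: the servlet hands request.getParameter("query") straight to QueryParser, so input containing syntax characters such as '*' or ':' raises a ParseException. A hedged sketch of a defensive variant (the field name and analyzer are assumptions, not necessarily what Anserini uses):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

public class SafeQueryParsing {
    // Escapes the raw input first, so special characters are matched
    // literally instead of being interpreted as query syntax.
    static Query parseUserQuery(String raw) throws ParseException {
        QueryParser parser = new QueryParser("text", new StandardAnalyzer());
        return parser.parse(QueryParser.escape(raw));
    }
}

Whether to escape the input or to report the parse error is a design choice; the servlet above simply logs the failure.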
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
The class IdfPassageScorer, method getTermIdfJSON.
@Override
public JSONObject getTermIdfJSON(List<String> sentList) {
    // EnglishAnalyzer ea = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    ClassicSimilarity similarity = new ClassicSimilarity();
    for (String sent : sentList) {
        String[] thisSentence = sent.trim().split("\\s+");
        for (String term : thisSentence) {
            try {
                TermQuery q = (TermQuery) qp.parse(term);
                Term t = q.getTerm();
                double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
                termIdfMap.put(term, String.valueOf(termIDF));
            } catch (Exception e) {
                // terms that fail to parse, or do not reduce to a single TermQuery, are skipped
                continue;
            }
        }
    }
    return new JSONObject(termIdfMap);
}
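For reference, ClassicSimilarity implements the classic TF-IDF idf, roughly 1 + ln(numDocs / (docFreq + 1)) depending on the Lucene version. A minimal sketch of querying it directly against an existing index (the path, field, and term are illustrative):

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.FSDirectory;

public class TermIdf {
    public static void main(String[] args) throws Exception {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")));
        ClassicSimilarity similarity = new ClassicSimilarity();
        Term t = new Term("contents", "earthquake"); // field and term are illustrative
        // rare terms (low docFreq) get a larger idf than common ones
        float idf = similarity.idf(reader.docFreq(t), reader.numDocs());
        System.out.println("idf(earthquake) = " + idf);
        reader.close();
    }
}

Note that the method above deliberately goes through QueryParser so each term is analyzed (lowercased, stemmed) the same way it was at index time; constructing a raw new Term(...) as in this sketch skips that step.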