use of gate.creole.annic.apache.lucene.search.TermQuery in project gate-core by GateNLP.
the class LuceneIndexer method getNamesOfSerializedFiles.
/**
 * Collects the serialized-file identifiers stored in the index for one
 * document.
 * <p>
 * The index is searched for entries whose {@code Constants.DOCUMENT_ID} field
 * equals {@code documentID}; the value of each hit's
 * {@code Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE} field is added to the
 * result. (The earlier description of this method called these values
 * "annotation set names".)
 *
 * @param documentID the value to match against {@code Constants.DOCUMENT_ID}
 * @return the distinct field values of all hits; empty if there are no hits
 * @throws IndexException if an {@link IOException} occurs while opening,
 *           searching or closing the index
 */
public Set<String> getNamesOfSerializedFiles(String documentID) throws IndexException {
  // Resolve the index directory from the configured URL. A URL that cannot
  // be expressed as a URI falls back to its raw file component.
  String indexPath;
  try {
    indexPath = new File(((URL) parameters.get(Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
  } catch (URISyntaxException badUri) {
    indexPath = new File(((URL) parameters.get(Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
  }

  Set<String> serializedFileIds = new HashSet<String>();
  try {
    TermQuery byDocumentId = new TermQuery(new Term(Constants.DOCUMENT_ID, documentID));
    gate.creole.annic.apache.lucene.search.Searcher indexSearcher = new IndexSearcher(indexPath);
    try {
      // Run the query and harvest one field value per hit.
      Hits matches = indexSearcher.search(byDocumentId);
      int hitIndex = 0;
      while (hitIndex < matches.length()) {
        Document hit = matches.doc(hitIndex);
        serializedFileIds.add(hit.get(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE));
        hitIndex++;
      }
      return serializedFileIds;
    } finally {
      // Always release the searcher, even when reading a hit fails.
      indexSearcher.close();
    }
  } catch (IOException ioe) {
    throw new IndexException(ioe);
  }
}
use of gate.creole.annic.apache.lucene.search.TermQuery in project gate-core by GateNLP.
the class QueryParser method parse.
/**
 * Parses a query string into one or more Lucene queries.
 * <p>
 * The query string is first split by {@code SubQueryParser.parseQuery}; each
 * resulting sub-query is converted with {@code createPhraseQuery} and wrapped
 * in a {@code BooleanQuery} that also carries the corpus / annotation-set
 * restrictions.
 * <p>
 * Supported syntax, as listed by the original author: the {@code |} operator,
 * the Kleene operators {@code *} and {@code +}, an implicit {@code &} between
 * tokens, and these token forms:
 * <ol>
 * <li>String (without quotes)</li>
 * <li>"String" (with quotes)</li>
 * <li>{AnnotationType}</li>
 * <li>{AnnotationType==String}</li>
 * <li>{AnnotationType=="String"}</li>
 * <li>{AnnotationType.feature==string}</li>
 * <li>{AnnotationType.feature=="string"}</li>
 * </ol>
 * The query is processed from left to right.
 * <p>
 * Side effects: the {@code field}, {@code baseTokenAnnotationType},
 * {@code position} and {@code queries} members are overwritten.
 *
 * @param field stored in {@code this.field}
 * @param query the query string to parse
 * @param baseTokenAnnotationType stored in {@code this.baseTokenAnnotationType}
 * @param corpusID if non-null, a term on {@code Constants.CORPUS_ID} is added
 *          to every returned query
 * @param annotationSetToSearchIn if non-null, a term on
 *          {@code Constants.ANNOTATION_SET_ID} with this value is added;
 *          otherwise a term for {@code Constants.COMBINED_SET} is added with
 *          the flags {@code (false, true)}
 * @return one query per sub-query, in the order produced by the sub-query
 *         parser
 * @throws gate.creole.ir.SearchException
 */
public Query[] parse(String field, String query, String baseTokenAnnotationType, String corpusID, String annotationSetToSearchIn) throws gate.creole.ir.SearchException {
  this.field = field;
  this.baseTokenAnnotationType = baseTokenAnnotationType;
  this.position = 0;

  // Each normalized sub-query becomes one entry of the result.
  queries = SubQueryParser.parseQuery(query);
  Query[] result = new Query[queries.size()];

  final boolean unrestricted = corpusID == null && annotationSetToSearchIn == null;

  for (int idx = 0; idx < queries.size(); idx++) {
    Query phraseQuery = createPhraseQuery(queries.get(idx));
    BooleanQuery combined = new BooleanQuery();

    // The two boolean flags are presumably (required, prohibited), as in
    // classic Lucene's BooleanQuery.add - confirm against the ANNIC fork.
    // Clause order is kept exactly as it was in both branches.
    if (unrestricted) {
      combined.add(new TermQuery(new Term(Constants.ANNOTATION_SET_ID, Constants.COMBINED_SET)), false, true);
      combined.add(phraseQuery, true, false);
    } else {
      combined.add(phraseQuery, true, false);

      if (corpusID != null) {
        combined.add(new TermQuery(new Term(Constants.CORPUS_ID, corpusID)), true, false);
      }

      if (annotationSetToSearchIn == null) {
        combined.add(new TermQuery(new Term(Constants.ANNOTATION_SET_ID, Constants.COMBINED_SET)), false, true);
      } else {
        combined.add(new TermQuery(new Term(Constants.ANNOTATION_SET_ID, annotationSetToSearchIn)), true, false);
      }
    }

    result[idx] = combined;
  }
  return result;
}
use of gate.creole.annic.apache.lucene.search.TermQuery in project gate-core by GateNLP.
the class QueryParser method createPhraseQuery.
/**
* Builds the Lucene query for one normalized sub-query.
* <p>
* The sub-query is split into tokens with findTokens and every token is
* expanded into terms with createTerms. If there is exactly one token and it
* expands to exactly one term, a TermQuery on that term is returned. In all
* other cases a PhraseQuery is returned.
* <p>
* The PhraseQuery does not necessarily cover every token: terms are added
* token by token, and building stops as soon as a token after the first one
* contains a term for which isBaseTokenTerm is false. If the first token
* contains such a term, its terms are still added and building stops at the
* second token.
* <p>
* Side effects: the position field is reset to 0, and needValidation is set
* to true whenever a term that is not a base token term was encountered
* (false otherwise).
*
* @param query one normalized sub-query, e.g.
*          {Person}"said" "Hello" {Person.gender=="male"}
* @return a TermQuery for a single-term query, otherwise a PhraseQuery
* @throws gate.creole.ir.SearchException declared here; presumably raised by
*           findTokens or createTerms (not thrown directly in this method)
*/
private Query createPhraseQuery(String query) throws gate.creole.ir.SearchException {
// Background on term positions (original author's explanation):
// a query like {Lookup}{Token}{Person.gender=="male"} is converted into
// the phrase (Lookup Token Person male). By default Lucene expects each
// term one position after the previous one, i.e. if Lookup is found at
// position 19 it would look for Token at 20, Person at 21 and male at 22.
// What is needed instead is Lookup at 19, Token at 20 and BOTH Person and
// male at 21, because the type and its feature value describe the same
// annotation. Therefore the position of every term is supplied explicitly
// when it is added to the phrase query below.
position = 0;
PhraseQuery phQuery = new PhraseQuery();
// Split the query into its tokens, e.g.
// {Person}"said" "Hello" {Person.gender=="male"} becomes
// {Person}
// "said"
// "Hello"
// {Person.gender=="male"}
List<String> tokens = findTokens(query);
// Shortcut: a single token that expands to a single term needs no phrase
// query at all.
if (tokens.size() == 1) {
List<?>[] termsPos = createTerms(tokens.get(0));
@SuppressWarnings("unchecked") List<Term> terms = (List<Term>) termsPos[0];
if (terms.size() == 1) {
// areAllTermsTokens is a field not assigned in this method; presumably
// createTerms updates it - TODO confirm.
if (areAllTermsTokens)
needValidation = false;
else
needValidation = true;
return new TermQuery(terms.get(0));
} else {
// createTerms is called again for this token in the loop below, so the
// position counter is reset first (presumably createTerms advances it).
position = 0;
}
}
// number of terms added to the phrase query with their "consider" flag set
int totalTerms = 0;
// whether all terms of the previously processed token were base token terms
boolean hadPreviousTermsAToken = true;
needValidation = false;
// Expand each token into its term(s) and add them to the phrase query.
outer: for (int i = 0; i < tokens.size(); i++) {
// createTerms returns three parallel lists: [0] the terms, [1] the
// position of each term, [2] the flag passed on to PhraseQuery.add.
List<?>[] termpositions = createTerms(tokens.get(i));
@SuppressWarnings("unchecked") List<Term> terms = (List<Term>) termpositions[0];
@SuppressWarnings("unchecked") List<Integer> pos = (List<Integer>) termpositions[1];
@SuppressWarnings("unchecked") List<Boolean> consider = (List<Boolean>) termpositions[2];
boolean allTermsTokens = true;
// Check whether every term of this token is a base token term; once one
// term fails the check, isBaseTokenTerm is not called for the rest.
for (int k = 0; k < terms.size(); k++) {
Term t = terms.get(k);
if (allTermsTokens)
allTermsTokens = isBaseTokenTerm(t);
}
// The previous token contained a non-base-token term (this can only be
// the first token, see below): stop extending the phrase.
if (!hadPreviousTermsAToken) {
needValidation = true;
break;
}
if (!allTermsTokens) {
// This token contains a non-base-token term. It is only added to the
// phrase if it is the very first token; otherwise stop here.
needValidation = true;
if (i > 0)
break outer;
}
// Add every term at its explicit position; only terms whose flag is
// true count towards the total.
for (int k = 0; k < terms.size(); k++) {
Term t = terms.get(k);
boolean considerValue = consider.get(k).booleanValue();
phQuery.add(t, pos.get(k), considerValue);
if (considerValue)
totalTerms++;
}
hadPreviousTermsAToken = allTermsTokens;
}
phQuery.setTotalTerms(totalTerms);
return phQuery;
}
use of gate.creole.annic.apache.lucene.search.TermQuery in project gate-core by GateNLP.
the class StatsCalculator method freq.
/**
 * Retrieves the frequency of an annotation type, feature or feature value in
 * the index behind the given searcher.
 * <p>
 * The kind of lookup depends on which of {@code featureName} and
 * {@code value} are supplied:
 * <ul>
 * <li>neither: the annotation type itself;</li>
 * <li>{@code featureName} only: the feature {@code annotationType.featureName};</li>
 * <li>both: the given value of {@code annotationType.featureName};</li>
 * <li>{@code value} only: rejected with a {@link SearchException}.</li>
 * </ul>
 * The result is the sum, over all hits, of the frequency recorded for each
 * hit in the searcher's first-term-position data. A hit that has no entry in
 * that data contributes nothing.
 * <p>
 * The searcher is <b>not</b> closed by this method; the caller must close
 * it. Failing to do so may leave many files open at the same time, which can
 * cause problems with the operating system. The searcher's term positions
 * are re-initialised before this method returns, whether it succeeds or not.
 *
 * @param searcher the searcher to query; remains open
 * @param corpusToSearchIn corpus id to restrict the search to; {@code null}
 *          or blank means no restriction
 * @param annotationSetToSearchIn annotation set id to restrict the search
 *          to; {@code null} or blank means no restriction
 * @param annotationType the annotation type; must not be {@code null}
 * @param featureName the feature name, or {@code null}
 * @param value the feature value, or {@code null}
 * @return the summed frequency, or 0 if the searcher recorded no positions
 * @throws SearchException if {@code annotationType} is {@code null}, if a
 *           value is given without a feature name, or if reading the index
 *           fails with an {@link IOException}
 */
public static int freq(IndexSearcher searcher, String corpusToSearchIn, String annotationSetToSearchIn, String annotationType, String featureName, String value) throws SearchException {
  try {
    // Blank restrictions are treated exactly like missing ones.
    corpusToSearchIn = corpusToSearchIn == null || corpusToSearchIn.trim().length() == 0 ? null : corpusToSearchIn.trim();
    annotationSetToSearchIn = annotationSetToSearchIn == null || annotationSetToSearchIn.trim().length() == 0 ? null : annotationSetToSearchIn.trim();
    if (annotationType == null) {
      throw new SearchException("Annotation Type cannot be null");
    }

    // Term that describes what is to be counted in the index.
    Term term;
    if (featureName == null && value == null) {
      term = new Term("contents", annotationType, "*");
    } else if (featureName != null && value == null) {
      term = new Term("contents", annotationType + "." + featureName, "**");
    } else if (featureName == null) {
      throw new SearchException("FeatureName cannot be null");
    } else {
      term = new Term("contents", value, annotationType + "." + featureName);
    }
    TermQuery tq = new TermQuery(term);

    // A boolean query is only needed when at least one restriction applies.
    boolean useBooleanQuery = false;
    BooleanQuery bq = new BooleanQuery();
    if (corpusToSearchIn != null) {
      PhraseQuery cq = new PhraseQuery();
      cq.add(new Term(Constants.CORPUS_ID, corpusToSearchIn), 0, true);
      bq.add(cq, true, false);
      useBooleanQuery = true;
    }
    if (annotationSetToSearchIn != null) {
      PhraseQuery aq = new PhraseQuery();
      aq.add(new Term(Constants.ANNOTATION_SET_ID, annotationSetToSearchIn), 0, true);
      bq.add(aq, true, false);
      useBooleanQuery = true;
    }

    Hits corpusHits;
    if (useBooleanQuery) {
      // The restrictions are added first, the actual term last.
      bq.add(tq, true, false);
      corpusHits = searcher.search(bq);
    } else {
      corpusHits = searcher.search(tq);
    }

    // Element 0 is searched for hit ids below; element 4 holds the Integer
    // frequency stored at the same index.
    List<?>[] firstTermPositions = searcher.getFirstTermPositions();
    if (firstTermPositions[0].isEmpty()) {
      // nothing was recorded for this query
      return 0;
    }

    int size = 0;
    for (int hitIndex = 0; hitIndex < corpusHits.length(); hitIndex++) {
      int index = firstTermPositions[0].indexOf(Integer.valueOf(corpusHits.id(hitIndex)));
      if (index < 0) {
        // No positions recorded for this hit: it contributes no frequency.
        // Without this guard get(-1) would fail with an unchecked
        // out-of-bounds exception instead of a SearchException.
        continue;
      }
      Integer hitFrequency = (Integer) firstTermPositions[4].get(index);
      size += hitFrequency.intValue();
    }
    return size;
  } catch (IOException ioe) {
    throw new SearchException(ioe);
  } finally {
    // Leave the searcher clean for the next query.
    searcher.initializeTermPositions();
  }
}
use of gate.creole.annic.apache.lucene.search.TermQuery in project gate-core by GateNLP.
the class LuceneSearcher method search.
/**
 * Runs a search and reports whether any results were found.
 * <p>
 * Two kinds of request are handled:
 * <ul>
 * <li><b>Corpus lookup.</b> If {@code parameters} has exactly two entries,
 * one of them {@code Constants.INDEX_LOCATION_URL}, and a
 * {@code Constants.CORPUS_ID} is present, the index at that location is
 * searched for entries of that corpus only. The hits are kept in
 * {@code luceneHits}, {@code wasDeleteQuery} is set, and {@code query} is
 * not used.</li>
 * <li><b>Pattern search.</b> Otherwise {@code query} is run against every
 * location in {@code Constants.INDEX_LOCATIONS} through one
 * {@code LuceneSearchThread} per location. If no locations are supplied, the
 * location configured on the datastore's indexer is used and is also written
 * back into {@code parameters}.</li>
 * </ul>
 * All state left over from a previous search is reset first.
 *
 * @param query the query to run; must not be {@code null} for a pattern
 *          search
 * @param parameters the search parameters; must not be {@code null}. A
 *          pattern search requires {@code Constants.CONTEXT_WINDOW} as an
 *          {@code Integer} greater than 0.
 * @return {@code true} if at least one hit (corpus lookup) or at least one
 *         successful search thread (pattern search) was obtained
 * @throws SearchException if a required parameter is missing or invalid, or
 *           if the index cannot be read
 */
@SuppressWarnings("unchecked")
@Override
public boolean search(String query, Map<String, Object> parameters) throws SearchException {
  // Reset everything a previous search may have left behind.
  luceneHits = null;
  annicPatterns = new ArrayList<Pattern>();
  annotationTypesMap = new HashMap<String, List<String>>();
  luceneSearchThreads = new ArrayList<LuceneSearchThread>();
  luceneSearchThreadIndex = 0;
  success = false;
  fwdIterationEnded = false;
  wasDeleteQuery = false;

  if (parameters == null) {
    throw new SearchException("Parameters cannot be null");
  }
  this.parameters = parameters;

  /*
   * First check whether this is a request for the documents stored under a
   * specific corpus, rather than a pattern search.
   */
  if (parameters.size() == 2 && parameters.get(Constants.INDEX_LOCATION_URL) != null) {
    String corpusID = (String) parameters.get(Constants.CORPUS_ID);
    String indexLocation = indexUrlToAbsolutePath((URL) parameters.get(Constants.INDEX_LOCATION_URL));
    if (corpusID != null) {
      wasDeleteQuery = true;
      TermQuery tq = new TermQuery(new Term(Constants.CORPUS_ID, corpusID));
      try {
        // NOTE(review): this searcher is never closed. The hits are kept in
        // the luceneHits field and are presumably still backed by it -
        // confirm before adding a close() here.
        gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(indexLocation);
        luceneHits = searcher.search(tq);
        success = luceneHits.length() > 0;
        return success;
      } catch (IOException ioe) {
        // The cause travels with the SearchException; it is not additionally
        // dumped to stderr.
        throw new SearchException(ioe);
      }
    }
  }

  // Fall back to the indexer's own location when the caller supplied none.
  // Note that this writes into the caller's parameter map.
  if (parameters.get(Constants.INDEX_LOCATIONS) == null) {
    String defaultLocation = indexUrlToAbsolutePath((URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL));
    ArrayList<String> defaultLocations = new ArrayList<String>();
    defaultLocations.add(defaultLocation);
    parameters.put(Constants.INDEX_LOCATIONS, defaultLocations);
  }
  indexLocations = new ArrayList<String>((List<? extends String>) parameters.get(Constants.INDEX_LOCATIONS));
  if (indexLocations.isEmpty()) {
    throw new SearchException("Corpus is not initialized");
  }

  // check for valid context window
  if (parameters.get(Constants.CONTEXT_WINDOW) == null) {
    throw new SearchException("Parameter " + Constants.CONTEXT_WINDOW + " is not provided!");
  }
  contextWindow = ((Integer) parameters.get(Constants.CONTEXT_WINDOW)).intValue();
  if (getContextWindow().intValue() <= 0) {
    throw new SearchException("Context Window must be atleast 1 or > 1");
  }

  if (query == null) {
    throw new SearchException("Query is not initialized");
  }
  this.query = query;
  this.corpusToSearchIn = (String) parameters.get(Constants.CORPUS_ID);
  this.annotationSetToSearchIn = (String) parameters.get(Constants.ANNOTATION_SET_ID);

  // annicPatterns, annotationTypesMap and luceneSearchThreads were already
  // reset at the top of this method and have not been touched since.

  // TODO: is this really useful or used to have several indexLocations ?
  for (int indexCounter = 0; indexCounter < indexLocations.size(); indexCounter++) {
    String location = indexLocations.get(indexCounter);
    // one search thread object per index; only successful ones are kept
    LuceneSearchThread lst = new LuceneSearchThread();
    if (lst.search(query, contextWindow, location, corpusToSearchIn, annotationSetToSearchIn, this)) {
      luceneSearchThreads.add(lst);
    }
  }
  success = luceneSearchThreads.size() > 0;
  return success;
}

/**
 * Converts an index location URL into an absolute file system path. A URL
 * that cannot be expressed as a URI falls back to its raw file component.
 */
private static String indexUrlToAbsolutePath(URL indexUrl) {
  try {
    return new File(indexUrl.toURI()).getAbsolutePath();
  } catch (URISyntaxException use) {
    return new File(indexUrl.getFile()).getAbsolutePath();
  }
}
Aggregations