Use of gate.creole.annic.apache.lucene.search.BooleanQuery in project gate-core by GateNLP.
From the class QueryParser, method parse.
/**
* Converts a query string into one or more Lucene queries.
* <p>
* The query text is split into sub-queries by {@code SubQueryParser.parseQuery}.
* Each sub-query is turned into a phrase query via {@code createPhraseQuery} and
* wrapped in a {@code BooleanQuery} that also carries the corpus and
* annotation-set restrictions.
* <p>
* Notes carried over from the original author: only the {@code |} operator is
* supported explicitly, the implicit operator is {@code &}, and the Kleene
* operators {@code *} and {@code +} are supported. The token forms listed are:
* <ol>
* <li>String (without quotes)</li>
* <li>"String" (with quotes)</li>
* <li>{AnnotationType}</li>
* <li>{AnnotationType==String}</li>
* <li>{AnnotationType=="String"}</li>
* <li>{AnnotationType.feature==string}</li>
* <li>{AnnotationType.feature=="string"}</li>
* </ol>
*
* @param field stored on this parser as the field to use
* @param query the query text to split into sub-queries
* @param baseTokenAnnotationType stored on this parser as the base token annotation type
* @param corpusID when non-null, a required clause on {@code Constants.CORPUS_ID} is added
* @param annotationSetToSearchIn when non-null, a required clause on
*        {@code Constants.ANNOTATION_SET_ID} is added; when null, a clause on
*        {@code Constants.COMBINED_SET} is added with the flags {@code (false, true)}
* @return one wrapped query per sub-query, in the order the sub-queries were produced
* @throws gate.creole.ir.SearchException declared by this method; presumably raised by
*         the parsing helpers it delegates to
*/
public Query[] parse(String field, String query, String baseTokenAnnotationType, String corpusID, String annotationSetToSearchIn) throws gate.creole.ir.SearchException {
  // Reset the parser state for this run.
  this.field = field;
  this.baseTokenAnnotationType = baseTokenAnnotationType;
  this.position = 0;

  // Sub-queries are processed left to right; each one becomes a phrase query.
  queries = SubQueryParser.parseQuery(query);
  Query[] result = new Query[queries.size()];

  // True when the caller restricts neither the corpus nor the annotation set.
  final boolean unrestricted = corpusID == null && annotationSetToSearchIn == null;

  for (int n = 0; n < queries.size(); n++) {
    Query phrase = createPhraseQuery(queries.get(n));
    BooleanQuery wrapper = new BooleanQuery();

    // NOTE(review): the two boolean flags are presumably (required, prohibited)
    // as in the Lucene 1.x BooleanQuery.add signature - confirm.
    if (unrestricted) {
      // The combined-set clause goes in first, then the phrase query.
      wrapper.add(new TermQuery(new Term(Constants.ANNOTATION_SET_ID, Constants.COMBINED_SET)), false, true);
      wrapper.add(phrase, true, false);
    } else {
      // The phrase query goes in first, followed by the restrictions.
      wrapper.add(phrase, true, false);
      if (corpusID != null) {
        wrapper.add(new TermQuery(new Term(Constants.CORPUS_ID, corpusID)), true, false);
      }
      if (annotationSetToSearchIn == null) {
        wrapper.add(new TermQuery(new Term(Constants.ANNOTATION_SET_ID, Constants.COMBINED_SET)), false, true);
      } else {
        wrapper.add(new TermQuery(new Term(Constants.ANNOTATION_SET_ID, annotationSetToSearchIn)), true, false);
      }
    }
    result[n] = wrapper;
  }
  return result;
}
Use of gate.creole.annic.apache.lucene.search.BooleanQuery in project gate-core by GateNLP.
From the class StatsCalculator, method freq.
/**
* Retrieves the total frequency for the given annotation type / feature / value,
* optionally restricted to a corpus and an annotation set.
* <p>
* The caller is responsible for closing the searcher. Failing to do so may
* leave many files open at the same time, which can cause problems with the
* operating system. This method only resets the searcher's term positions
* (via {@code initializeTermPositions()}) before returning, whether it
* succeeds or fails.
*
* @param searcher the searcher to run the query on; not closed by this method
* @param corpusToSearchIn corpus id to restrict to; null or blank means no restriction
* @param annotationSetToSearchIn annotation set to restrict to; null or blank means
*        no restriction
* @param annotationType the annotation type to count; must not be null
* @param featureName the feature name, or null to count the annotation type itself
* @param value the feature value, or null to count any value of the feature;
*        requires a non-null {@code featureName}
* @return the sum of the per-document frequencies over all hits, or 0 when the
*         search recorded no first term positions
* @throws SearchException if {@code annotationType} is null, if {@code value} is
*         given without {@code featureName}, if a hit has no matching entry in the
*         recorded term positions, or if the underlying search fails with an
*         {@code IOException}
*/
public static int freq(IndexSearcher searcher, String corpusToSearchIn, String annotationSetToSearchIn, String annotationType, String featureName, String value) throws SearchException {
  try {
    // Normalise blank restrictions to null so that they are ignored below.
    corpusToSearchIn = corpusToSearchIn == null || corpusToSearchIn.trim().length() == 0 ? null : corpusToSearchIn.trim();
    annotationSetToSearchIn = annotationSetToSearchIn == null || annotationSetToSearchIn.trim().length() == 0 ? null : annotationSetToSearchIn.trim();
    if (annotationType == null) {
      throw new SearchException("Annotation Type cannot be null");
    }

    // Term that contains the value to be searched in the index.
    Term term;
    if (featureName == null && value == null) {
      term = new Term("contents", annotationType, "*");
    } else if (featureName != null && value == null) {
      term = new Term("contents", annotationType + "." + featureName, "**");
    } else if (featureName == null) {
      // A value without a feature name cannot be looked up.
      throw new SearchException("FeatureName cannot be null");
    } else {
      term = new Term("contents", value, annotationType + "." + featureName);
    }
    TermQuery tq = new TermQuery(term);

    // The boolean query is only used when at least one restriction applies.
    boolean useBooleanQuery = false;
    BooleanQuery bq = new BooleanQuery();
    if (corpusToSearchIn != null) {
      PhraseQuery cq = new PhraseQuery();
      cq.add(new Term(Constants.CORPUS_ID, corpusToSearchIn), 0, true);
      bq.add(cq, true, false);
      useBooleanQuery = true;
    }
    if (annotationSetToSearchIn != null) {
      PhraseQuery aq = new PhraseQuery();
      aq.add(new Term(Constants.ANNOTATION_SET_ID, annotationSetToSearchIn), 0, true);
      bq.add(aq, true, false);
      useBooleanQuery = true;
    }

    Hits corpusHits;
    if (useBooleanQuery) {
      // The term query is added last, after the restriction clauses.
      bq.add(tq, true, false);
      corpusHits = searcher.search(bq);
    } else {
      corpusHits = searcher.search(tq);
    }

    // Slot 0 is searched for the hit document ids below; slot 4 is read as the
    // matching Integer frequencies at the same index.
    List<?>[] firstTermPositions = searcher.getFirstTermPositions();
    // No recorded positions means nothing to count.
    if (firstTermPositions[0].isEmpty()) {
      return 0;
    }

    int size = 0;
    for (int hitIndex = 0; hitIndex < corpusHits.length(); hitIndex++) {
      int docId = corpusHits.id(hitIndex);
      int index = firstTermPositions[0].indexOf(Integer.valueOf(docId));
      if (index < 0) {
        // Previously this fell through to List.get(-1) and escaped as an
        // unchecked IndexOutOfBoundsException.
        throw new SearchException("No term position information found for document " + docId);
      }
      Integer freq = (Integer) firstTermPositions[4].get(index);
      size += freq.intValue();
    }
    return size;
  } catch (IOException ioe) {
    throw new SearchException(ioe);
  } finally {
    // Always reset the positions collected by this search.
    searcher.initializeTermPositions();
  }
}
Use of gate.creole.annic.apache.lucene.search.BooleanQuery in project gate-core by GateNLP.
From the class LuceneSearcher, method getIndexedAnnotationSetNames.
/**
* Returns the names of the annotation sets that are indexed. Each entry has
* the following format:
* <p>
* corpusName;annotationSetName
* </p>
* where corpusName is the value stored under {@code Constants.CORPUS_ID} for a
* document carrying that annotation set (the empty string when the document
* has no corpus id).
* <p>
* As a side effect this method rebuilds {@code annotationTypesMap}: for every
* entry found in a document's {@code Constants.INDEXED_FEATURES} value
* (semicolon-separated {@code AnnotationType.FeatureName} items) the feature
* name is recorded under the key {@code corpusID;annotationSetName;AnnotationType}.
*
* @return the distinct {@code corpusName;annotationSetName} entries, in no
*         particular order; an empty array if the index yields no term enumeration
* @throws SearchException if reading the index fails with an {@code IOException}
*/
@Override
public String[] getIndexedAnnotationSetNames() throws SearchException {
  // Resolve the index directory from the indexer parameters; fall back to the
  // raw URL path when the URL is not a syntactically valid URI.
  URL indexUrl = (URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL);
  String indexLocation;
  try {
    indexLocation = new File(indexUrl.toURI()).getAbsolutePath();
  } catch (URISyntaxException use) {
    indexLocation = new File(indexUrl.getFile()).getAbsolutePath();
  }

  annotationTypesMap = new HashMap<String, List<String>>();
  Set<String> toReturn = new HashSet<String>();
  try {
    IndexReader reader = IndexReader.open(indexLocation);
    try {
      // Enumerate terms starting at the first annotation-set-id term.
      TermEnum terms = reader.terms(new Term(Constants.ANNOTATION_SET_ID, ""));
      if (terms == null) {
        return new String[0];
      }

      // Collect the names of all indexed annotation sets.
      Set<String> annotSets = new HashSet<String>();
      try {
        boolean foundAnnotSet = false;
        do {
          Term t = terms.term();
          if (t == null) {
            continue;
          }
          if (t.field().equals(Constants.ANNOTATION_SET_ID)) {
            annotSets.add(t.text());
            foundAnnotSet = true;
          } else if (foundAnnotSet) {
            // Past the run of annotation-set-id terms: nothing more to collect.
            break;
          }
        } while (terms.next());
      } finally {
        // The enumeration was previously never closed.
        terms.close();
      }

      for (String annotSet : annotSets) {
        TermQuery tq = new TermQuery(new Term(Constants.ANNOTATION_SET_ID, annotSet));
        gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(indexLocation);
        try {
          Hits annotSetHits = searcher.search(tq);
          for (int i = 0; i < annotSetHits.length(); i++) {
            Document luceneDoc = annotSetHits.doc(i);
            // Not every document belongs to a corpus.
            String corpusID = luceneDoc.get(Constants.CORPUS_ID);
            if (corpusID == null) {
              corpusID = "";
            }
            toReturn.add(corpusID + ";" + annotSet);

            // NOTE(review): both clauses match the same annotation-set term, so
            // this query is not restricted to corpusID - confirm whether a
            // corpus clause was intended.
            TermQuery atq = new TermQuery(new Term(Constants.ANNOTATION_SET_ID, annotSet));
            BooleanQuery bq = new BooleanQuery();
            bq.add(tq, true, false);
            bq.add(atq, true, false);

            gate.creole.annic.apache.lucene.search.Searcher indexFeatureSearcher = new IndexSearcher(indexLocation);
            try {
              // Run the nested query on its own searcher; it used to run on the
              // outer searcher while the outer hits were still being iterated,
              // leaving this searcher opened and closed without ever being used.
              Hits indexFeaturesHits = indexFeatureSearcher.search(bq);
              for (int j = 0; j < indexFeaturesHits.length(); j++) {
                Document aDoc = indexFeaturesHits.doc(j);
                String indexedFeatures = aDoc.get(Constants.INDEXED_FEATURES);
                if (indexedFeatures == null) {
                  continue;
                }
                for (String aFeature : indexedFeatures.split(";")) {
                  // Expected form: AnnotationType.FeatureName
                  int index = aFeature.indexOf(".");
                  if (index == -1) {
                    continue;
                  }
                  String type = aFeature.substring(0, index);
                  String featureName = aFeature.substring(index + 1);
                  String key = corpusID + ";" + annotSet + ";" + type;
                  List<String> listOfFeatures = annotationTypesMap.get(key);
                  if (listOfFeatures == null) {
                    listOfFeatures = new ArrayList<String>();
                    annotationTypesMap.put(key, listOfFeatures);
                  }
                  if (!listOfFeatures.contains(featureName)) {
                    listOfFeatures.add(featureName);
                  }
                }
              }
            } finally {
              indexFeatureSearcher.close();
            }
          }
        } finally {
          searcher.close();
        }
      }
    } finally {
      reader.close();
    }
  } catch (IOException ioe) {
    // Single wrapping point for all index I/O failures; the cause is preserved.
    throw new SearchException(ioe);
  }
  return toReturn.toArray(new String[0]);
}
Aggregations