use of gate.creole.annic.SearchException in project gate-core by GateNLP.
the class LuceneSearcher method next.
/**
* Returns the next numberOfHits results; -1 indicates all remaining hits.
*/
@Override
public Hit[] next(int numberOfHits) throws SearchException {
annicPatterns = new ArrayList<Pattern>();
if (!success) {
this.annicPatterns = new ArrayList<Pattern>();
return getHits();
}
if (fwdIterationEnded) {
this.annicPatterns = new ArrayList<Pattern>();
return getHits();
}
try {
if (wasDeleteQuery) {
List<String> docIDs = new ArrayList<String>();
List<String> setNames = new ArrayList<String>();
for (int i = 0; i < luceneHits.length(); i++) {
Document luceneDoc = luceneHits.doc(i);
String documentID = luceneDoc.get(Constants.DOCUMENT_ID);
String annotationSetID = luceneDoc.get(Constants.ANNOTATION_SET_ID);
int index = docIDs.indexOf(documentID);
if (index == -1) {
docIDs.add(documentID);
setNames.add(annotationSetID);
} else {
if (!setNames.get(index).equals(annotationSetID)) {
docIDs.add(documentID);
setNames.add(annotationSetID);
}
}
}
Hit[] toReturn = new Hit[docIDs.size()];
for (int i = 0; i < toReturn.length; i++) {
toReturn[i] = new Hit(docIDs.get(i), setNames.get(i), 0, 0, "");
}
return toReturn;
}
for (; luceneSearchThreadIndex < luceneSearchThreads.size(); luceneSearchThreadIndex++) {
LuceneSearchThread lst = luceneSearchThreads.get(luceneSearchThreadIndex);
List<Pattern> results = lst.next(numberOfHits);
if (results != null) {
if (numberOfHits != -1) {
numberOfHits -= results.size();
}
this.annicPatterns.addAll(results);
if (numberOfHits == 0) {
return getHits();
}
}
}
// if we are here, there were not enough patterns available, so we
// mark forward iteration as ended so that this method returns no
// further results on subsequent calls
fwdIterationEnded = true;
return getHits();
} catch (Exception e) {
throw new SearchException(e);
}
}
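A minimal usage sketch of the paging behaviour shown above: fetch hits in batches until next() returns no more. The wrapper class and batch size are illustrative, and the getter names on Hit (getDocumentID, getAnnotationSetName) are assumed from the constructor arguments used in the delete-query branch.

import gate.creole.annic.Hit;
import gate.creole.annic.SearchException;
import gate.creole.annic.lucene.LuceneSearcher;
import java.util.Map;

public class PagingSketch {

  // Page through every hit of an already-configured LuceneSearcher, 50 at a time.
  public static void printAllHits(LuceneSearcher searcher, String query,
      Map<String, Object> parameters) throws SearchException {
    if (!searcher.search(query, parameters)) {
      return; // no results were found
    }
    Hit[] page = searcher.next(50); // a batch of at most 50 hits
    while (page != null && page.length > 0) {
      for (Hit hit : page) {
        System.out.println(hit.getDocumentID() + " : " + hit.getAnnotationSetName());
      }
      page = searcher.next(50); // an empty result signals the end of iteration
    }
  }
}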
use of gate.creole.annic.SearchException in project gate-core by GateNLP.
the class StatsCalculator method freqForAllValues.
/**
* Calculates frequencies for all possible values of the provided AT.feature
* @param patternsToSearchIn the hits (patterns) whose annotations are inspected
* @param annotationType the annotation type (AT) to look for
* @param feature the feature whose values are counted
* @param inMatchedSpan whether to consider annotations inside the matched span
* @param inContext whether to consider annotations inside the left and right context
* @return a map where the key is a unique value of AT.feature and the value is the Integer count for that value.
* @throws SearchException
*/
public static Map<String, Integer> freqForAllValues(List<Hit> patternsToSearchIn, String annotationType, String feature, boolean inMatchedSpan, boolean inContext) throws SearchException {
Map<String, Integer> toReturn = new HashMap<String, Integer>();
if (patternsToSearchIn == null || patternsToSearchIn.isEmpty())
return toReturn;
if (!inMatchedSpan && !inContext)
throw new SearchException("Both inMatchedSpan and inContext cannot be set to false");
for (Hit aResult1 : patternsToSearchIn) {
Pattern aResult = (Pattern) aResult1;
List<PatternAnnotation> annots = new ArrayList<PatternAnnotation>();
if (inMatchedSpan && !inContext) {
annots = aResult.getPatternAnnotations(aResult.getStartOffset(), aResult.getEndOffset());
} else if (!inMatchedSpan && inContext) {
annots = aResult.getPatternAnnotations(aResult.getLeftContextStartOffset(), aResult.getStartOffset());
annots.addAll(aResult.getPatternAnnotations(aResult.getEndOffset(), aResult.getRightContextEndOffset()));
} else {
// both matchedSpan and context are set to true
annots = Arrays.asList(aResult.getPatternAnnotations());
}
if (annots.isEmpty())
continue;
List<PatternAnnotation> subAnnots = getPatternAnnotations(annots, annotationType, feature);
for (PatternAnnotation pa : subAnnots) {
String uniqueKey = pa.getFeatures().get(feature);
Integer counter = toReturn.get(uniqueKey);
if (counter == null) {
counter = 1;
toReturn.put(uniqueKey, counter);
} else {
counter = counter.intValue() + 1;
toReturn.put(uniqueKey, counter);
}
}
}
return toReturn;
}
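A short sketch of how the returned frequency map might be consumed, assuming the hits come from an earlier search; the Token/category names are placeholders and the StatsCalculator package is assumed to be gate.creole.annic.lucene.

import gate.creole.annic.Hit;
import gate.creole.annic.SearchException;
import gate.creole.annic.lucene.StatsCalculator; // package assumed
import java.util.List;
import java.util.Map;

public class ValueFrequencySketch {

  // Print how often each value of Token.category occurs inside the matched spans.
  public static void printCategoryFrequencies(List<Hit> hits) throws SearchException {
    Map<String, Integer> freqs =
        StatsCalculator.freqForAllValues(hits, "Token", "category", true, false);
    for (Map.Entry<String, Integer> entry : freqs.entrySet()) {
      System.out.println(entry.getKey() + "\t" + entry.getValue());
    }
  }
}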
use of gate.creole.annic.SearchException in project gate-core by GateNLP.
the class StatsCalculator method freq.
/**
* Allows retrieving frequencies for the given parameters. Please make
* sure that you close the searcher on your own. Failing to do so may
* result in many files being opened at the same time, which can cause
* problems with your OS.
* @throws SearchException
*/
public static int freq(IndexSearcher searcher, String corpusToSearchIn, String annotationSetToSearchIn, String annotationType, String featureName, String value) throws SearchException {
try {
corpusToSearchIn = corpusToSearchIn == null || corpusToSearchIn.trim().length() == 0 ? null : corpusToSearchIn.trim();
annotationSetToSearchIn = annotationSetToSearchIn == null || annotationSetToSearchIn.trim().length() == 0 ? null : annotationSetToSearchIn.trim();
if (annotationType == null)
throw new SearchException("Annotation Type cannot be null");
// term that contains a value to be searched in the index
Term term = null;
if (featureName == null && value == null) {
term = new Term("contents", annotationType, "*");
} else if (featureName != null && value == null) {
term = new Term("contents", annotationType + "." + featureName, "**");
} else if (featureName == null) {
throw new SearchException("FeatureName cannot be null");
} else {
term = new Term("contents", value, annotationType + "." + featureName);
}
// term query
TermQuery tq = new TermQuery(term);
// indicates whether we want to use booleanQuery
boolean useBooleanQuery = false;
BooleanQuery bq = new BooleanQuery();
if (corpusToSearchIn != null) {
PhraseQuery cq = new PhraseQuery();
cq.add(new Term(Constants.CORPUS_ID, corpusToSearchIn), 0, true);
bq.add(cq, true, false);
useBooleanQuery = true;
}
if (annotationSetToSearchIn != null) {
PhraseQuery aq = new PhraseQuery();
aq.add(new Term(Constants.ANNOTATION_SET_ID, annotationSetToSearchIn), 0, true);
bq.add(aq, true, false);
useBooleanQuery = true;
}
Hits corpusHits = null;
if (useBooleanQuery) {
bq.add(tq, true, false);
corpusHits = searcher.search(bq);
} else {
corpusHits = searcher.search(tq);
}
List<?>[] firstTermPositions = searcher.getFirstTermPositions();
// if no results are available, return 0
if (firstTermPositions[0].size() == 0) {
return 0;
}
int size = 0;
// go through each hit and accumulate its frequency information
for (int hitIndex = 0; hitIndex < corpusHits.length(); hitIndex++) {
int index = firstTermPositions[0].indexOf(new Integer(corpusHits.id(hitIndex)));
// fetch the frequency recorded for this hit and add it to the total
Integer freq = (Integer) firstTermPositions[4].get(index);
size += freq.intValue();
}
return size;
} catch (IOException ioe) {
throw new SearchException(ioe);
} finally {
searcher.initializeTermPositions();
}
}
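A hedged sketch of calling freq() while honouring the javadoc's requirement to close the searcher yourself; it assumes the IndexSearcher here is the ANNIC-modified gate.creole.annic.apache.lucene.search.IndexSearcher seen elsewhere on this page, and the annotation type and IDs are placeholders.

import gate.creole.annic.SearchException;
import gate.creole.annic.apache.lucene.search.IndexSearcher; // assumed ANNIC fork of Lucene
import gate.creole.annic.lucene.StatsCalculator; // package assumed
import java.io.IOException;

public class FreqSketch {

  // Count Person annotations in one corpus/annotation set, closing the searcher ourselves.
  public static int countPersons(String indexLocation, String corpusId, String setName)
      throws SearchException, IOException {
    IndexSearcher searcher = new IndexSearcher(indexLocation);
    try {
      // null featureName and value count every Person annotation regardless of its features
      return StatsCalculator.freq(searcher, corpusId, setName, "Person", null, null);
    } finally {
      searcher.close(); // avoid leaking open index files
    }
  }
}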
use of gate.creole.annic.SearchException in project gate-core by GateNLP.
the class LuceneSearchThread method search.
/**
* This method collects the necessary information from lucene and uses
* it when the next method is called
*
* @param query query supplied by the user
* @param patternWindow number of tokens of left and right context to
* include around each match
* @param indexLocation location of the index the searcher should
* search in
* @param luceneSearcher the LuceneSearcher instance from which this
* search thread was invoked
* @return true iff the search was successful, false otherwise
*/
@SuppressWarnings("unchecked")
public boolean search(String query, int patternWindow, String indexLocation, String corpusToSearchIn, String annotationSetToSearchIn, LuceneSearcher luceneSearcher) throws SearchException {
this.query = query;
this.contextWindow = patternWindow;
this.indexLocation = indexLocation;
this.queryParser = new QueryParser();
this.luceneSearcher = luceneSearcher;
/*
* reset all parameters that keep track of where we are in our
* searching. These parameters are used mostly to keep track of
* where to start fetching the next results from
*/
searchResultInfoMap = new HashMap<String, List<QueryItem>>();
serializedFileIDIndex = 0;
queryItemIndex = 0;
serializedFilesIDsList = new ArrayList<String>();
ftpIndex = -1;
success = false;
fwdIterationEnded = false;
try {
// first find out the location of Index
// TODO does this just replace \ with / if so we should do this better
StringBuilder temp = new StringBuilder();
for (int i = 0; i < indexLocation.length(); i++) {
if (indexLocation.charAt(i) == '\\') {
temp.append("/");
} else {
temp.append(indexLocation.charAt(i));
}
}
indexLocation = temp.toString();
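// (the loop above simply replaces every '\\' with '/', i.e. it is
// equivalent to indexLocation.replace('\\', '/'))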
/*
* each location can have a different baseTokenAnnotationType. Every
* index stores its index definition file under the index directory,
* so first check that the given location is a valid directory
*/
File locationFile = new File(indexLocation);
if (!locationFile.isDirectory()) {
System.out.println("Skipping the invalid Index Location :" + indexLocation);
return false;
}
if (!indexLocation.endsWith("/")) {
indexLocation += "/";
}
// otherwise let us read the index definition file
locationFile = new File(indexLocation + "LuceneIndexDefinition.xml");
// check if this file is available
if (!locationFile.exists()) {
System.out.println("Index Definition file not found - Skipping the invalid Index Location :" + indexLocation + "LuceneIndexDefinition.xml");
return false;
}
Map<String, Object> indexInformation = null;
// otherwise read this file
XStream xstream = new XStream(new StaxDriver());
try (FileReader fileReader = new FileReader(indexLocation + "LuceneIndexDefinition.xml")) {
// Saving was accomplished by using XML serialization of the map.
indexInformation = (Map<String, Object>) xstream.fromXML(fileReader);
}
// find out if the current index was indexed by annicIndexPR
String indexedWithANNICIndexPR = (String) indexInformation.get(Constants.CORPUS_INDEX_FEATURE);
if (indexedWithANNICIndexPR == null || !indexedWithANNICIndexPR.equals(Constants.CORPUS_INDEX_FEATURE_VALUE)) {
System.out.println("This corpus was not indexed by Annic Index PR - Skipping the invalid Index");
return false;
}
// find out the baseTokenAnnotationType name
baseTokenAnnotationType = ((String) indexInformation.get(Constants.BASE_TOKEN_ANNOTATION_TYPE)).trim();
int separatorIndex = baseTokenAnnotationType.lastIndexOf('.');
if (separatorIndex >= 0) {
baseTokenAnnotationType = baseTokenAnnotationType.substring(separatorIndex + 1);
}
// create various Queries from the user's query
Query[] luceneQueries = queryParser.parse("contents", query, baseTokenAnnotationType, corpusToSearchIn, annotationSetToSearchIn);
if (queryParser.needValidation()) {
if (DEBUG)
System.out.println("Validation enabled!");
} else {
if (DEBUG)
System.out.println("Validation disabled!");
}
// create an instance of Index Searcher
LuceneIndexSearcher searcher = new LuceneIndexSearcher(indexLocation);
try {
// we need to iterate through one query at a time
for (int luceneQueryIndex = 0; luceneQueryIndex < luceneQueries.length; luceneQueryIndex++) {
/*
* this call reinitializes the first Term positions arraylists
* which are being used to store the results
*/
searcher.initializeTermPositions();
/*
* and now execute the query result of which will be stored in
* hits
*/
Hits hits = searcher.search(luceneQueries[luceneQueryIndex]);
/*
* and so now find out the positions of the first terms in the
* returned results. first term position is the position of the
* first term in the found pattern
*/
List<?>[] firstTermPositions = searcher.getFirstTermPositions();
// if no results are available for this query, move on to the next one
if (firstTermPositions[0].size() == 0) {
// do nothing
continue;
}
// collect the relevant information for each hit
for (int hitIndex = 0; hitIndex < hits.length(); hitIndex++) {
int index = firstTermPositions[0].indexOf(Integer.valueOf(hits.id(hitIndex)));
// we fetch all the first term positions for the query
// issued
List<?> ftp = (List<?>) firstTermPositions[1].get(index);
/*
* pattern length (in terms of total number of annotations
* following one another)
*/
int patLen = ((Integer) firstTermPositions[2].get(index)).intValue();
/*
* and the type of query (whether it contains a single annotation or
* a sequence of terms)
*/
int qType = ((Integer) firstTermPositions[3].get(index)).intValue();
// find out the documentID
String serializedFileID = hits.doc(hitIndex).get(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE);
QueryItem queryItem = new QueryItem();
queryItem.annotationSetName = hits.doc(hitIndex).get(Constants.ANNOTATION_SET_ID).intern();
queryItem.id = hits.id(hitIndex);
queryItem.documentID = hits.doc(hitIndex).get(Constants.DOCUMENT_ID).intern();
queryItem.ftp = ftp;
queryItem.patLen = patLen;
queryItem.qType = qType;
queryItem.query = luceneQueries[luceneQueryIndex];
queryItem.queryString = queryParser.getQueryString(luceneQueryIndex).intern();
/*
* all this information goes into the top-level map: we create a
* separate list for each individual document, where each element
* of the list provides information about a different query issued
* over it
*/
List<QueryItem> queryItemsList = searchResultInfoMap.get(serializedFileID);
if (queryItemsList == null) {
queryItemsList = new ArrayList<QueryItem>();
queryItemsList.add(queryItem);
searchResultInfoMap.put(serializedFileID, queryItemsList);
serializedFilesIDsList.add(serializedFileID);
} else {
// // before inserting we check if it is already added
// if(!doesAlreadyExist(queryItem, queryItemsList)) {
queryItemsList.add(queryItem);
// }
}
}
}
} finally {
searcher.close();
}
// if any result possible, return true
if (searchResultInfoMap.size() > 0)
success = true;
else
success = false;
} catch (IOException | gate.creole.ir.SearchException e) {
throw new SearchException(e);
}
return success;
}
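The method above is driven once per index location; a compact sketch of that driver loop, mirroring the code in LuceneSearcher.search further down this page (the wrapper class is illustrative and the package names are assumed):

import gate.creole.annic.SearchException;
import gate.creole.annic.lucene.LuceneSearchThread; // package assumed
import gate.creole.annic.lucene.LuceneSearcher;     // package assumed
import java.util.ArrayList;
import java.util.List;

public class PerIndexSearchSketch {

  // Create one LuceneSearchThread per index location, keeping only those that found results.
  public static List<LuceneSearchThread> searchAllIndexes(List<String> indexLocations,
      String query, int contextWindow, String corpusId, String annotationSetName,
      LuceneSearcher owner) throws SearchException {
    List<LuceneSearchThread> threads = new ArrayList<LuceneSearchThread>();
    for (String location : indexLocations) {
      LuceneSearchThread thread = new LuceneSearchThread();
      if (thread.search(query, contextWindow, location, corpusId, annotationSetName, owner)) {
        threads.add(thread);
      }
    }
    return threads;
  }
}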
use of gate.creole.annic.SearchException in project gate-core by GateNLP.
the class LuceneSearcher method search.
/**
* Method returns true/false indicating whether results were found or not.
*/
@SuppressWarnings("unchecked")
@Override
public boolean search(String query, Map<String, Object> parameters) throws SearchException {
luceneHits = null;
annicPatterns = new ArrayList<Pattern>();
annotationTypesMap = new HashMap<String, List<String>>();
luceneSearchThreads = new ArrayList<LuceneSearchThread>();
luceneSearchThreadIndex = 0;
success = false;
fwdIterationEnded = false;
wasDeleteQuery = false;
if (parameters == null)
throw new SearchException("Parameters cannot be null");
this.parameters = parameters;
/*
* let us first check whether the query is to search the document names.
* This is used when we only want to search for documents stored under a
* specific corpus
*/
if (parameters.size() == 2 && parameters.get(Constants.INDEX_LOCATION_URL) != null) {
String corpusID = (String) parameters.get(Constants.CORPUS_ID);
String indexLocation = null;
try {
indexLocation = new File(((URL) parameters.get(Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
} catch (URISyntaxException use) {
indexLocation = new File(((URL) parameters.get(Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
}
if (corpusID != null && indexLocation != null) {
wasDeleteQuery = true;
Term term = new Term(Constants.CORPUS_ID, corpusID);
TermQuery tq = new TermQuery(term);
try {
gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(indexLocation);
// and now execute the query
// result of which will be stored in hits
luceneHits = searcher.search(tq);
success = luceneHits.length() > 0 ? true : false;
return success;
} catch (IOException ioe) {
ioe.printStackTrace();
throw new SearchException(ioe);
}
}
}
// check for index locations
if (parameters.get(Constants.INDEX_LOCATIONS) == null) {
String indexLocation;
try {
indexLocation = new File(((URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
} catch (URISyntaxException use) {
indexLocation = new File(((URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
}
ArrayList<String> indexLocations = new ArrayList<String>();
indexLocations.add(indexLocation);
parameters.put(Constants.INDEX_LOCATIONS, indexLocations);
}
indexLocations = new ArrayList<String>((List<? extends String>) parameters.get(Constants.INDEX_LOCATIONS));
if (indexLocations.size() == 0)
throw new SearchException("Corpus is not initialized");
// check for valid context window
if (parameters.get(Constants.CONTEXT_WINDOW) == null)
throw new SearchException("Parameter " + Constants.CONTEXT_WINDOW + " is not provided!");
contextWindow = ((Integer) parameters.get(Constants.CONTEXT_WINDOW)).intValue();
if (getContextWindow().intValue() <= 0)
throw new SearchException("Context Window must be atleast 1 or > 1");
if (query == null)
throw new SearchException("Query is not initialized");
this.query = query;
this.corpusToSearchIn = (String) parameters.get(Constants.CORPUS_ID);
this.annotationSetToSearchIn = (String) parameters.get(Constants.ANNOTATION_SET_ID);
annicPatterns = new ArrayList<Pattern>();
annotationTypesMap = new HashMap<String, List<String>>();
luceneSearchThreads = new ArrayList<LuceneSearchThread>();
// TODO: is this really useful or used to have several indexLocations ?
for (int indexCounter = 0; indexCounter < indexLocations.size(); indexCounter++) {
String location = indexLocations.get(indexCounter);
// we create a separate Thread for each index
LuceneSearchThread lst = new LuceneSearchThread();
if (lst.search(query, contextWindow, location, corpusToSearchIn, annotationSetToSearchIn, this)) {
luceneSearchThreads.add(lst);
}
}
success = luceneSearchThreads.size() > 0 ? true : false;
return success;
}
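A hedged sketch of how a caller might assemble the parameters map this method expects before paging through the results; the constant names come from the code above, while the query string, index directory and commented-out IDs are placeholders.

import gate.creole.annic.Constants;
import gate.creole.annic.Hit;
import gate.creole.annic.SearchException;
import gate.creole.annic.lucene.LuceneSearcher; // package assumed
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SearcherUsageSketch {

  // Run an ANNIC query over one index directory and return every hit.
  public static Hit[] runQuery(LuceneSearcher searcher, String indexDir) throws SearchException {
    Map<String, Object> parameters = new HashMap<String, Object>();
    List<String> indexLocations = new ArrayList<String>();
    indexLocations.add(indexDir); // placeholder index directory
    parameters.put(Constants.INDEX_LOCATIONS, indexLocations);
    parameters.put(Constants.CONTEXT_WINDOW, Integer.valueOf(5)); // tokens of context on each side
    // optional restrictions; leave them out to search everything
    // parameters.put(Constants.CORPUS_ID, "myCorpusId");
    // parameters.put(Constants.ANNOTATION_SET_ID, "mySetName");
    if (searcher.search("{Person}", parameters)) { // placeholder ANNIC query
      return searcher.next(-1); // -1 fetches all hits
    }
    return new Hit[0];
  }
}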