Search in sources :

Example 1 with SearchException

use of gate.creole.annic.SearchException in project gate-core by GateNLP.

the class LuceneSearcher method next.

/**
 * Returns the next batch of hits for the most recent search.
 * <p>
 * Passing {@code -1} as {@code numberOfHits} asks for all remaining hits.
 * If the last search was unsuccessful, or an earlier call already exhausted
 * the results, the pattern buffer is reset and {@code getHits()} is returned
 * immediately.
 * <p>
 * After a corpus-only ("delete") query, one {@link Hit} is returned per
 * distinct (document ID, annotation set ID) pair found in the lucene hits;
 * offsets are 0 and the query string is empty for such hits.
 *
 * @param numberOfHits maximum number of hits to fetch, or -1 for all
 * @return the hits collected by this call
 * @throws SearchException wrapping any exception raised while collecting hits
 */
@Override
public Hit[] next(int numberOfHits) throws SearchException {
    // every call starts with an empty buffer; presumably getHits() reports
    // whatever annicPatterns holds at the time it is called
    annicPatterns = new ArrayList<Pattern>();
    if (!success || fwdIterationEnded) {
        return getHits();
    }
    try {
        if (wasDeleteQuery) {
            // parallel lists: entry i of docIDs pairs with entry i of setNames
            List<String> docIDs = new ArrayList<String>();
            List<String> setNames = new ArrayList<String>();
            for (int i = 0; i < luceneHits.length(); i++) {
                Document luceneDoc = luceneHits.doc(i);
                String documentID = luceneDoc.get(Constants.DOCUMENT_ID);
                String annotationSetID = luceneDoc.get(Constants.ANNOTATION_SET_ID);
                // compare against EVERY pair recorded so far: looking only at
                // the first entry of a document lets a repeated
                // (document, set) pair slip through once the document has
                // more than one annotation set
                boolean alreadyRecorded = false;
                for (int j = 0; j < docIDs.size() && !alreadyRecorded; j++) {
                    alreadyRecorded = java.util.Objects.equals(docIDs.get(j), documentID)
                            && java.util.Objects.equals(setNames.get(j), annotationSetID);
                }
                if (!alreadyRecorded) {
                    docIDs.add(documentID);
                    setNames.add(annotationSetID);
                }
            }
            Hit[] toReturn = new Hit[docIDs.size()];
            for (int i = 0; i < toReturn.length; i++) {
                toReturn[i] = new Hit(docIDs.get(i), setNames.get(i), 0, 0, "");
            }
            return toReturn;
        }
        // resume from the search thread the previous call stopped at
        for (; luceneSearchThreadIndex < luceneSearchThreads.size(); luceneSearchThreadIndex++) {
            LuceneSearchThread lst = luceneSearchThreads.get(luceneSearchThreadIndex);
            List<Pattern> results = lst.next(numberOfHits);
            if (results == null) {
                continue;
            }
            if (numberOfHits != -1) {
                // only a bounded request has a remaining quota to track
                numberOfHits -= results.size();
            }
            this.annicPatterns.addAll(results);
            if (numberOfHits == 0) {
                // quota filled; stay on this thread for the next call
                return getHits();
            }
        }
        // all search threads are exhausted without filling the quota:
        // remember that, so the next call returns early with an empty buffer
        fwdIterationEnded = true;
        return getHits();
    } catch (Exception e) {
        throw new SearchException(e);
    }
}
Also used : Pattern(gate.creole.annic.Pattern) ArrayList(java.util.ArrayList) SearchException(gate.creole.annic.SearchException) Document(gate.creole.annic.apache.lucene.document.Document) URISyntaxException(java.net.URISyntaxException) IOException(java.io.IOException) SearchException(gate.creole.annic.SearchException) Hit(gate.creole.annic.Hit)

Example 2 with SearchException

use of gate.creole.annic.SearchException in project gate-core by GateNLP.

the class StatsCalculator method freqForAllValues.

/**
 * Counts how often each distinct value of {@code annotationType.feature}
 * occurs in the given patterns.
 * <p>
 * Depending on the two flags, annotations are taken from the matched span
 * only, from the left and right context only, or from the whole pattern.
 *
 * @param patternsToSearchIn hits to inspect; every element is cast to
 *          {@link Pattern}. A null or empty list yields an empty map.
 * @param annotationType annotation type to look for
 * @param feature feature whose values are counted
 * @param inMatchedSpan whether to look inside the matched span
 * @param inContext whether to look inside the surrounding context
 * @return a map from each feature value to the number of times it was seen
 * @throws SearchException if both {@code inMatchedSpan} and
 *           {@code inContext} are false (and there are patterns to search)
 */
public static Map<String, Integer> freqForAllValues(List<Hit> patternsToSearchIn, String annotationType, String feature, boolean inMatchedSpan, boolean inContext) throws SearchException {
    Map<String, Integer> frequencies = new HashMap<String, Integer>();
    if (patternsToSearchIn == null || patternsToSearchIn.isEmpty()) {
        return frequencies;
    }
    if (!(inMatchedSpan || inContext)) {
        throw new SearchException("Both inMatchedSpan and inContext cannot be set to false");
    }
    for (Hit hit : patternsToSearchIn) {
        Pattern pattern = (Pattern) hit;
        List<PatternAnnotation> candidates;
        if (inMatchedSpan && inContext) {
            // whole pattern: matched span plus both contexts
            candidates = Arrays.asList(pattern.getPatternAnnotations());
        } else if (inMatchedSpan) {
            candidates = pattern.getPatternAnnotations(pattern.getStartOffset(), pattern.getEndOffset());
        } else {
            // context only: left context first, then right context appended
            candidates = pattern.getPatternAnnotations(pattern.getLeftContextStartOffset(), pattern.getStartOffset());
            candidates.addAll(pattern.getPatternAnnotations(pattern.getEndOffset(), pattern.getRightContextEndOffset()));
        }
        if (!candidates.isEmpty()) {
            for (PatternAnnotation annotation : getPatternAnnotations(candidates, annotationType, feature)) {
                String value = annotation.getFeatures().get(feature);
                Integer seenSoFar = frequencies.get(value);
                frequencies.put(value, seenSoFar == null ? 1 : seenSoFar + 1);
            }
        }
    }
    return frequencies;
}
Also used : Pattern(gate.creole.annic.Pattern) Hit(gate.creole.annic.Hit) HashMap(java.util.HashMap) PatternAnnotation(gate.creole.annic.PatternAnnotation) ArrayList(java.util.ArrayList) SearchException(gate.creole.annic.SearchException)

Example 3 with SearchException

use of gate.creole.annic.SearchException in project gate-core by GateNLP.

the class StatsCalculator method freq.

/**
 * Retrieves the frequency of an annotation type, of a feature of that type,
 * or of a specific feature value, optionally restricted to one corpus
 * and/or one annotation set.
 * <p>
 * Please make sure that you close the searcher on your own. Failing to do
 * so may leave many files open at the same time, which can cause problems
 * with your OS. This method only re-initialises the searcher's term
 * positions when it finishes.
 *
 * @param searcher searcher to run the query with; not closed by this method
 * @param corpusToSearchIn corpus ID to restrict to; null or blank means no
 *          restriction
 * @param annotationSetToSearchIn annotation set ID to restrict to; null or
 *          blank means no restriction
 * @param annotationType annotation type, must not be null
 * @param featureName feature name; may be null only if {@code value} is
 *          null as well
 * @param value feature value, or null to count every value
 * @return the sum of the frequencies recorded for all hits, 0 if none
 * @throws SearchException if {@code annotationType} is null, if
 *           {@code value} is given without {@code featureName}, if a hit
 *           has no recorded term position data, or if the index cannot be
 *           read
 */
public static int freq(IndexSearcher searcher, String corpusToSearchIn, String annotationSetToSearchIn, String annotationType, String featureName, String value) throws SearchException {
    try {
        // blank restrictions are treated the same as absent ones
        corpusToSearchIn = corpusToSearchIn == null || corpusToSearchIn.trim().length() == 0 ? null : corpusToSearchIn.trim();
        annotationSetToSearchIn = annotationSetToSearchIn == null || annotationSetToSearchIn.trim().length() == 0 ? null : annotationSetToSearchIn.trim();
        if (annotationType == null) {
            throw new SearchException("Annotation Type cannot be null");
        }
        // term that contains a value to be searched in the index
        Term term;
        if (featureName == null && value == null) {
            term = new Term("contents", annotationType, "*");
        } else if (featureName != null && value == null) {
            term = new Term("contents", annotationType + "." + featureName, "**");
        } else if (featureName == null) {
            // a value without the feature it belongs to cannot be looked up
            throw new SearchException("FeatureName cannot be null");
        } else {
            term = new Term("contents", value, annotationType + "." + featureName);
        }
        TermQuery tq = new TermQuery(term);
        // the boolean query is only used when at least one restriction applies
        boolean useBooleanQuery = false;
        BooleanQuery bq = new BooleanQuery();
        if (corpusToSearchIn != null) {
            PhraseQuery cq = new PhraseQuery();
            cq.add(new Term(Constants.CORPUS_ID, corpusToSearchIn), 0, true);
            bq.add(cq, true, false);
            useBooleanQuery = true;
        }
        if (annotationSetToSearchIn != null) {
            PhraseQuery aq = new PhraseQuery();
            aq.add(new Term(Constants.ANNOTATION_SET_ID, annotationSetToSearchIn), 0, true);
            bq.add(aq, true, false);
            useBooleanQuery = true;
        }
        Hits corpusHits;
        if (useBooleanQuery) {
            bq.add(tq, true, false);
            corpusHits = searcher.search(bq);
        } else {
            corpusHits = searcher.search(tq);
        }
        // parallel lists filled in by the search: slot 0 holds the hit ids,
        // slot 4 holds the frequency recorded for the hit at the same position
        List<?>[] firstTermPositions = searcher.getFirstTermPositions();
        if (firstTermPositions[0].size() == 0) {
            // nothing was recorded, so nothing matched
            return 0;
        }
        int size = 0;
        for (int hitIndex = 0; hitIndex < corpusHits.length(); hitIndex++) {
            Integer hitId = Integer.valueOf(corpusHits.id(hitIndex));
            int index = firstTermPositions[0].indexOf(hitId);
            if (index < 0) {
                // fail with a diagnosable error instead of an
                // IndexOutOfBoundsException from get(-1)
                throw new SearchException("No term position information recorded for hit " + hitId);
            }
            Integer freq = (Integer) firstTermPositions[4].get(index);
            size += freq.intValue();
        }
        return size;
    } catch (IOException ioe) {
        throw new SearchException(ioe);
    } finally {
        // leave the searcher clean for whoever uses it next
        searcher.initializeTermPositions();
    }
}
Also used : TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) BooleanQuery(gate.creole.annic.apache.lucene.search.BooleanQuery) Hits(gate.creole.annic.apache.lucene.search.Hits) PhraseQuery(gate.creole.annic.apache.lucene.search.PhraseQuery) SearchException(gate.creole.annic.SearchException) ArrayList(java.util.ArrayList) List(java.util.List) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException)

Example 4 with SearchException

use of gate.creole.annic.SearchException in project gate-core by GateNLP.

the class LuceneSearchThread method search.

/**
 * This method collects the necessary information from lucene and uses
 * it when the next method is called.
 * <p>
 * An index location that is not a directory, has no
 * {@code LuceneIndexDefinition.xml}, or was not indexed by the ANNIC index
 * PR is reported on standard output and skipped (the method returns false).
 *
 * @param query query supplied by the user
 * @param patternWindow number of tokens to refer on left and right
 *          context
 * @param indexLocation location of the index the searcher should
 *          search in
 * @param corpusToSearchIn corpus restriction handed to the query parser
 * @param annotationSetToSearchIn annotation set restriction handed to the
 *          query parser
 * @param luceneSearcher an instance of lucene search from where the
 *          instance of SearchThread is invoked
 * @return true iff at least one result was recorded, false otherwise
 * @throws SearchException if the index definition lacks the base token
 *           annotation type, if a hit has no recorded term position data,
 *           or if reading or searching the index fails
 */
@SuppressWarnings("unchecked")
public boolean search(String query, int patternWindow, String indexLocation, String corpusToSearchIn, String annotationSetToSearchIn, LuceneSearcher luceneSearcher) throws SearchException {
    this.query = query;
    this.contextWindow = patternWindow;
    // note: the field keeps the location exactly as supplied; only the
    // local copy is normalised below
    this.indexLocation = indexLocation;
    this.queryParser = new QueryParser();
    this.luceneSearcher = luceneSearcher;
    /*
     * reset all parameters that keep track of where we are in our
     * searching. These parameters are used mostly to keep track of
     * where to start fetching the next results from
     */
    searchResultInfoMap = new HashMap<String, List<QueryItem>>();
    serializedFileIDIndex = 0;
    queryItemIndex = 0;
    serializedFilesIDsList = new ArrayList<String>();
    ftpIndex = -1;
    success = false;
    fwdIterationEnded = false;
    try {
        // normalise the path separators of the index location
        indexLocation = indexLocation.replace('\\', '/');
        /*
         * for each different location there can be different
         * baseTokenAnnotationType each index will have their index
         * Definition file stored under the index directory so first see
         * if given location is a valid directory
         */
        File locationFile = new File(indexLocation);
        if (!locationFile.isDirectory()) {
            System.out.println("Skipping the invalid Index Location :" + indexLocation);
            return false;
        }
        if (!indexLocation.endsWith("/")) {
            indexLocation += "/";
        }
        // otherwise let us read the index definition file
        locationFile = new File(indexLocation + "LuceneIndexDefinition.xml");
        if (!locationFile.exists()) {
            System.out.println("Index Definition file not found - Skipping the invalid Index Location :" + indexLocation + "LuceneIndexDefinition.xml");
            return false;
        }
        Map<String, Object> indexInformation = null;
        XStream xstream = new XStream(new StaxDriver());
        try (FileReader fileReader = new FileReader(indexLocation + "LuceneIndexDefinition.xml")) {
            // Saving was accomplished by using XML serialization of the map.
            indexInformation = (Map<String, Object>) xstream.fromXML(fileReader);
        }
        // find out if the current index was indexed by annicIndexPR
        String indexedWithANNICIndexPR = (String) indexInformation.get(Constants.CORPUS_INDEX_FEATURE);
        if (indexedWithANNICIndexPR == null || !indexedWithANNICIndexPR.equals(Constants.CORPUS_INDEX_FEATURE_VALUE)) {
            System.out.println("This corpus was not indexed by Annic Index PR - Skipping the invalid Index");
            return false;
        }
        // find out the baseTokenAnnotationType name
        String declaredBaseTokenType = (String) indexInformation.get(Constants.BASE_TOKEN_ANNOTATION_TYPE);
        if (declaredBaseTokenType == null) {
            // report a broken definition file explicitly instead of failing
            // with a NullPointerException on trim()
            throw new SearchException("Index definition in " + indexLocation + " does not specify the base token annotation type");
        }
        baseTokenAnnotationType = declaredBaseTokenType.trim();
        // keep only the part after the last '.', if there is one
        int separatorIndex = baseTokenAnnotationType.lastIndexOf('.');
        if (separatorIndex >= 0) {
            baseTokenAnnotationType = baseTokenAnnotationType.substring(separatorIndex + 1);
        }
        // create various Queries from the user's query
        Query[] luceneQueries = queryParser.parse("contents", query, baseTokenAnnotationType, corpusToSearchIn, annotationSetToSearchIn);
        boolean validationNeeded = queryParser.needValidation();
        if (DEBUG) {
            System.out.println(validationNeeded ? "Validation enabled!" : "Validation disabled!");
        }
        // create an instance of Index Searcher
        LuceneIndexSearcher searcher = new LuceneIndexSearcher(indexLocation);
        try {
            // we need to iterate through one query at a time
            for (int luceneQueryIndex = 0; luceneQueryIndex < luceneQueries.length; luceneQueryIndex++) {
                /*
                 * this call reinitializes the first Term positions arraylists
                 * which are being used to store the results
                 */
                searcher.initializeTermPositions();
                /*
                 * and now execute the query result of which will be stored in
                 * hits
                 */
                Hits hits = searcher.search(luceneQueries[luceneQueryIndex]);
                /*
                 * and so now find out the positions of the first terms in the
                 * returned results. first term position is the position of the
                 * first term in the found pattern
                 */
                List<?>[] firstTermPositions = searcher.getFirstTermPositions();
                if (firstTermPositions[0].size() == 0) {
                    // this query produced nothing, move on to the next one
                    continue;
                }
                for (int hitIndex = 0; hitIndex < hits.length(); hitIndex++) {
                    Integer hitId = Integer.valueOf(hits.id(hitIndex));
                    int index = firstTermPositions[0].indexOf(hitId);
                    if (index < 0) {
                        // fail with a diagnosable error instead of an
                        // IndexOutOfBoundsException from get(-1)
                        throw new SearchException("No term position information recorded for hit " + hitId);
                    }
                    // we fetch all the first term positions for the query
                    // issued
                    List<?> ftp = (List<?>) firstTermPositions[1].get(index);
                    /*
                     * pattern length (in terms of total number of annotations
                     * following one other)
                     */
                    int patLen = ((Integer) firstTermPositions[2].get(index)).intValue();
                    /*
                     * and the type of query (if it has only one annotation in it,
                     * or multiple terms following them)
                     */
                    int qType = ((Integer) firstTermPositions[3].get(index)).intValue();
                    // find out the documentID
                    String serializedFileID = hits.doc(hitIndex).get(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE);
                    QueryItem queryItem = new QueryItem();
                    queryItem.annotationSetName = hits.doc(hitIndex).get(Constants.ANNOTATION_SET_ID).intern();
                    queryItem.id = hits.id(hitIndex);
                    queryItem.documentID = hits.doc(hitIndex).get(Constants.DOCUMENT_ID).intern();
                    queryItem.ftp = ftp;
                    queryItem.patLen = patLen;
                    queryItem.qType = qType;
                    queryItem.query = luceneQueries[luceneQueryIndex];
                    queryItem.queryString = queryParser.getQueryString(luceneQueryIndex).intern();
                    /*
                     * all these information go in the top level arrayList. we
                     * create separate arrayList for each individual document
                     * where each element in the arrayList provides information
                     * about different query issued over it
                     */
                    List<QueryItem> queryItemsList = searchResultInfoMap.get(serializedFileID);
                    if (queryItemsList == null) {
                        // first result for this serialized file: register it
                        queryItemsList = new ArrayList<QueryItem>();
                        searchResultInfoMap.put(serializedFileID, queryItemsList);
                        serializedFilesIDsList.add(serializedFileID);
                    }
                    // items are appended without any duplicate filtering
                    queryItemsList.add(queryItem);
                }
            }
        } finally {
            searcher.close();
        }
        // if any result possible, return true
        success = !searchResultInfoMap.isEmpty();
    } catch (IOException | gate.creole.ir.SearchException e) {
        throw new SearchException(e);
    }
    return success;
}
Also used : Hits(gate.creole.annic.apache.lucene.search.Hits) Query(gate.creole.annic.apache.lucene.search.Query) SearchException(gate.creole.annic.SearchException) StaxDriver(com.thoughtworks.xstream.io.xml.StaxDriver) ArrayList(java.util.ArrayList) List(java.util.List) FileReader(java.io.FileReader) XStream(com.thoughtworks.xstream.XStream) IOException(java.io.IOException) File(java.io.File)

Example 5 with SearchException

use of gate.creole.annic.SearchException in project gate-core by GateNLP.

the class LuceneSearcher method search.

/**
 * Runs the given query and returns true/false indicating whether results
 * were found or not.
 * <p>
 * Two modes are supported. If {@code parameters} holds exactly two entries,
 * including an index location URL and a corpus ID, only the documents
 * indexed under that corpus are looked up. Otherwise the query is run
 * against every configured index location. If none is supplied, the
 * location is taken from the datastore's indexer and also stored back into
 * {@code parameters}.
 *
 * @param query the query to run; must not be null for a regular search
 * @param parameters search parameters; must not be null
 * @return true if at least one result is available, false otherwise
 * @throws SearchException if a mandatory parameter is missing or invalid,
 *           or if the index cannot be read
 */
@SuppressWarnings("unchecked")
@Override
public boolean search(String query, Map<String, Object> parameters) throws SearchException {
    // forget everything left over from a previous search
    luceneHits = null;
    annicPatterns = new ArrayList<Pattern>();
    annotationTypesMap = new HashMap<String, List<String>>();
    luceneSearchThreads = new ArrayList<LuceneSearchThread>();
    luceneSearchThreadIndex = 0;
    success = false;
    fwdIterationEnded = false;
    wasDeleteQuery = false;
    if (parameters == null) {
        throw new SearchException("Parameters cannot be null");
    }
    this.parameters = parameters;
    /*
     * lets first check if the query is to search the document names. This is
     * used when we only want to search for documents stored under the
     * specific corpus
     */
    if (parameters.size() == 2 && parameters.get(Constants.INDEX_LOCATION_URL) != null) {
        String corpusID = (String) parameters.get(Constants.CORPUS_ID);
        String indexLocation = indexUrlToAbsolutePath((URL) parameters.get(Constants.INDEX_LOCATION_URL));
        if (corpusID != null) {
            wasDeleteQuery = true;
            TermQuery tq = new TermQuery(new Term(Constants.CORPUS_ID, corpusID));
            try {
                // NOTE(review): the searcher is not closed here; luceneHits is
                // read again later by next(), so presumably it has to stay
                // open - confirm
                gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(indexLocation);
                // and now execute the query
                // result of which will be stored in hits
                luceneHits = searcher.search(tq);
                success = luceneHits.length() > 0;
                return success;
            } catch (IOException ioe) {
                // the cause travels with the exception; no separate stack dump
                throw new SearchException(ioe);
            }
        }
    }
    // check for index locations
    if (parameters.get(Constants.INDEX_LOCATIONS) == null) {
        // fall back to the location the datastore's indexer was set up with
        String defaultLocation = indexUrlToAbsolutePath((URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL));
        ArrayList<String> defaultLocations = new ArrayList<String>();
        defaultLocations.add(defaultLocation);
        parameters.put(Constants.INDEX_LOCATIONS, defaultLocations);
    }
    indexLocations = new ArrayList<String>((List<? extends String>) parameters.get(Constants.INDEX_LOCATIONS));
    if (indexLocations.size() == 0) {
        throw new SearchException("Corpus is not initialized");
    }
    // check for valid context window
    if (parameters.get(Constants.CONTEXT_WINDOW) == null) {
        throw new SearchException("Parameter " + Constants.CONTEXT_WINDOW + " is not provided!");
    }
    contextWindow = ((Integer) parameters.get(Constants.CONTEXT_WINDOW)).intValue();
    if (getContextWindow().intValue() <= 0) {
        throw new SearchException("Context Window must be atleast 1 or > 1");
    }
    if (query == null) {
        throw new SearchException("Query is not initialized");
    }
    this.query = query;
    this.corpusToSearchIn = (String) parameters.get(Constants.CORPUS_ID);
    this.annotationSetToSearchIn = (String) parameters.get(Constants.ANNOTATION_SET_ID);
    // TODO: is this really useful or used to have several indexLocations ?
    for (int indexCounter = 0; indexCounter < indexLocations.size(); indexCounter++) {
        String location = indexLocations.get(indexCounter);
        // one search thread object per index; only the ones that found
        // something are kept
        LuceneSearchThread lst = new LuceneSearchThread();
        if (lst.search(query, contextWindow, location, corpusToSearchIn, annotationSetToSearchIn, this)) {
            luceneSearchThreads.add(lst);
        }
    }
    success = luceneSearchThreads.size() > 0;
    return success;
}

/**
 * Converts an index location URL into an absolute file system path,
 * falling back to the raw file part of the URL when it is not a valid URI.
 */
private static String indexUrlToAbsolutePath(URL location) {
    try {
        return new File(location.toURI()).getAbsolutePath();
    } catch (URISyntaxException use) {
        return new File(location.getFile()).getAbsolutePath();
    }
}
Also used : IndexSearcher(gate.creole.annic.apache.lucene.search.IndexSearcher) Pattern(gate.creole.annic.Pattern) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) ArrayList(java.util.ArrayList) SearchException(gate.creole.annic.SearchException) URISyntaxException(java.net.URISyntaxException) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException) URL(java.net.URL) ArrayList(java.util.ArrayList) List(java.util.List) File(java.io.File)

Aggregations

SearchException (gate.creole.annic.SearchException)10 ArrayList (java.util.ArrayList)7 IOException (java.io.IOException)5 Pattern (gate.creole.annic.Pattern)4 List (java.util.List)4 Hit (gate.creole.annic.Hit)3 Term (gate.creole.annic.apache.lucene.index.Term)3 Hits (gate.creole.annic.apache.lucene.search.Hits)3 TermQuery (gate.creole.annic.apache.lucene.search.TermQuery)3 File (java.io.File)3 URISyntaxException (java.net.URISyntaxException)3 PatternAnnotation (gate.creole.annic.PatternAnnotation)2 Document (gate.creole.annic.apache.lucene.document.Document)2 BooleanQuery (gate.creole.annic.apache.lucene.search.BooleanQuery)2 IndexSearcher (gate.creole.annic.apache.lucene.search.IndexSearcher)2 GateRuntimeException (gate.util.GateRuntimeException)2 URL (java.net.URL)2 XStream (com.thoughtworks.xstream.XStream)1 StaxDriver (com.thoughtworks.xstream.io.xml.StaxDriver)1 SerialCorpusImpl (gate.corpora.SerialCorpusImpl)1