Search in sources :

Example 1 with BooleanQuery

use of gate.creole.annic.apache.lucene.search.BooleanQuery in project gate-core by GateNLP.

the class QueryParser method parse.

/**
 * Parses the given query string and converts it into one or more lucene
 * queries.
 * <p>
 * The query string is split into sub-queries by
 * {@code SubQueryParser.parseQuery}. Every sub-query is converted into a
 * phrase query and wrapped in a {@code BooleanQuery} together with term
 * clauses on the corpus id and the annotation set id.
 * <p>
 * Query syntax notes carried over from the original implementation: the only
 * explicit operator is {@code |}, the implicit operator is {@code &}, and the
 * Kleene operators {@code *} and {@code +} are supported. Recognised token
 * forms are:
 * <ul>
 * <li>String (without quotes)</li>
 * <li>"String" (with quotes)</li>
 * <li>{AnnotationType}</li>
 * <li>{AnnotationType==String}</li>
 * <li>{AnnotationType=="String"}</li>
 * <li>{AnnotationType.feature==string}</li>
 * <li>{AnnotationType.feature=="string"}</li>
 * </ul>
 *
 * @param field stored on this parser before parsing starts
 * @param query the query string to parse
 * @param baseTokenAnnotationType stored on this parser before parsing starts
 * @param corpusID corpus id to restrict the search to; {@code null} adds no
 *          corpus clause
 * @param annotationSetToSearchIn annotation set id to restrict the search to;
 *          when {@code null} a clause on {@code Constants.COMBINED_SET} is
 *          added instead
 * @return one wrapped query per parsed sub-query, in sub-query order
 * @throws gate.creole.ir.SearchException propagated from the parsing helpers
 *           this method calls
 */
public Query[] parse(String field, String query, String baseTokenAnnotationType, String corpusID, String annotationSetToSearchIn) throws gate.creole.ir.SearchException {
    // reset the parser state for this run
    this.field = field;
    this.baseTokenAnnotationType = baseTokenAnnotationType;
    this.position = 0;

    // the query is processed from left to right; every entry of the returned
    // list has to be converted into a phrase query
    queries = SubQueryParser.parseQuery(query);

    // true when the caller restricts neither the corpus nor the annotation set
    final boolean unrestricted = corpusID == null && annotationSetToSearchIn == null;

    Query[] result = new Query[queries.size()];
    for (int idx = 0; idx < queries.size(); idx++) {
        Query phrase = createPhraseQuery(queries.get(idx));
        BooleanQuery wrapper = new BooleanQuery();

        // NOTE(review): the boolean flags are assumed to follow the Lucene 1.x
        // add(query, required, prohibited) signature - confirm against the
        // ANNIC fork.

        // Clause order is kept exactly as before: in the restricted case the
        // phrase query comes first ...
        if (!unrestricted) {
            wrapper.add(phrase, true, false);
        }
        if (corpusID != null) {
            wrapper.add(new TermQuery(new Term(Constants.CORPUS_ID, corpusID)), true, false);
        }
        if (annotationSetToSearchIn != null) {
            wrapper.add(new TermQuery(new Term(Constants.ANNOTATION_SET_ID, annotationSetToSearchIn)), true, false);
        } else {
            wrapper.add(new TermQuery(new Term(Constants.ANNOTATION_SET_ID, Constants.COMBINED_SET)), false, true);
        }
        // ... whereas in the unrestricted case it follows the combined-set
        // clause.
        if (unrestricted) {
            wrapper.add(phrase, true, false);
        }

        result[idx] = wrapper;
    }
    return result;
}
Also used : BooleanQuery(gate.creole.annic.apache.lucene.search.BooleanQuery) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) PhraseQuery(gate.creole.annic.apache.lucene.search.PhraseQuery) Query(gate.creole.annic.apache.lucene.search.Query) Term(gate.creole.annic.apache.lucene.index.Term)

Example 2 with BooleanQuery

use of gate.creole.annic.apache.lucene.search.BooleanQuery in project gate-core by GateNLP.

the class StatsCalculator method freq.

/**
 * Allows retrieving frequencies for the given parameters. Please make sure
 * that you close the searcher on your own. Failing to do so may result in
 * many files being open at the same time, which can cause problems with
 * your OS.
 * <p>
 * The searcher's term positions are re-initialised before this method
 * returns, whether it completes normally or throws.
 *
 * @param searcher searcher used to run the query; it is not closed here
 * @param corpusToSearchIn corpus id to restrict the search to; {@code null}
 *          or blank means no corpus restriction
 * @param annotationSetToSearchIn annotation set id to restrict the search
 *          to; {@code null} or blank means no annotation set restriction
 * @param annotationType annotation type to count; must not be {@code null}
 * @param featureName feature name, or {@code null}; must be given whenever
 *          {@code value} is given
 * @param value feature value, or {@code null}
 * @return the sum of the frequencies recorded for all matching hits, or 0
 *         when the searcher recorded no term positions
 * @throws SearchException if {@code annotationType} is {@code null}, if
 *           {@code value} is given without {@code featureName}, if a hit has
 *           no recorded term position entry, or if the index cannot be read
 */
public static int freq(IndexSearcher searcher, String corpusToSearchIn, String annotationSetToSearchIn, String annotationType, String featureName, String value) throws SearchException {
    try {
        // treat blank restrictions the same as absent ones
        corpusToSearchIn = corpusToSearchIn == null || corpusToSearchIn.trim().length() == 0 ? null : corpusToSearchIn.trim();
        annotationSetToSearchIn = annotationSetToSearchIn == null || annotationSetToSearchIn.trim().length() == 0 ? null : annotationSetToSearchIn.trim();
        if (annotationType == null) {
            throw new SearchException("Annotation Type cannot be null");
        }
        // term that contains a value to be searched in the index
        // NOTE(review): the three-argument Term constructor belongs to the
        // ANNIC fork; the third argument is presumably the term type - confirm.
        Term term;
        if (featureName == null && value == null) {
            term = new Term("contents", annotationType, "*");
        } else if (featureName != null && value == null) {
            term = new Term("contents", annotationType + "." + featureName, "**");
        } else if (featureName == null) {
            // a value without a feature name cannot be looked up
            throw new SearchException("FeatureName cannot be null");
        } else {
            term = new Term("contents", value, annotationType + "." + featureName);
        }
        // term query
        TermQuery tq = new TermQuery(term);
        // indicates whether we want to use booleanQuery
        boolean useBooleanQuery = false;
        BooleanQuery bq = new BooleanQuery();
        if (corpusToSearchIn != null) {
            PhraseQuery cq = new PhraseQuery();
            cq.add(new Term(Constants.CORPUS_ID, corpusToSearchIn), 0, true);
            bq.add(cq, true, false);
            useBooleanQuery = true;
        }
        if (annotationSetToSearchIn != null) {
            PhraseQuery aq = new PhraseQuery();
            aq.add(new Term(Constants.ANNOTATION_SET_ID, annotationSetToSearchIn), 0, true);
            bq.add(aq, true, false);
            useBooleanQuery = true;
        }
        Hits corpusHits;
        if (useBooleanQuery) {
            bq.add(tq, true, false);
            corpusHits = searcher.search(bq);
        } else {
            corpusHits = searcher.search(tq);
        }
        List<?>[] firstTermPositions = searcher.getFirstTermPositions();
        // list 0 is searched for the document id of each hit below
        List<?> docIds = firstTermPositions[0];
        // nothing was recorded by the searcher, so the frequency is zero
        if (docIds.isEmpty()) {
            return 0;
        }
        int size = 0;
        for (int hitIndex = 0; hitIndex < corpusHits.length(); hitIndex++) {
            int docId = corpusHits.id(hitIndex);
            int index = docIds.indexOf(Integer.valueOf(docId));
            if (index < 0) {
                // previously this fell through to get(-1) and failed with an
                // opaque IndexOutOfBoundsException
                throw new SearchException("No term position information recorded for document id " + docId);
            }
            // list 4 holds, at the same index, the frequency for that document
            Integer freq = (Integer) firstTermPositions[4].get(index);
            size += freq.intValue();
        }
        return size;
    } catch (IOException ioe) {
        throw new SearchException(ioe);
    } finally {
        searcher.initializeTermPositions();
    }
}
Also used : TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) BooleanQuery(gate.creole.annic.apache.lucene.search.BooleanQuery) Hits(gate.creole.annic.apache.lucene.search.Hits) PhraseQuery(gate.creole.annic.apache.lucene.search.PhraseQuery) SearchException(gate.creole.annic.SearchException) ArrayList(java.util.ArrayList) List(java.util.List) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException)

Example 3 with BooleanQuery

use of gate.creole.annic.apache.lucene.search.BooleanQuery in project gate-core by GateNLP.

the class LuceneSearcher method getIndexedAnnotationSetNames.

/**
 * This method returns a set of annotation set names that are indexed. Each
 * entry has the following format:
 * <p>
 * corpusID;annotationSetName
 * </p>
 * where corpusID is the corpus id stored with a document that belongs to
 * the annotation set, or the empty string when the document stores none.
 * <p>
 * As a side effect {@code annotationTypesMap} is rebuilt: for every key
 * {@code corpusID;annotationSetName;annotationType} it lists the feature
 * names found in the documents' indexed-features field.
 *
 * @return the distinct {@code corpusID;annotationSetName} entries; an empty
 *         array when the index has no annotation set terms
 * @throws SearchException if the index cannot be read
 */
@Override
public String[] getIndexedAnnotationSetNames() throws SearchException {
    // the index location is stored as a URL parameter of the indexer
    URL indexUrl = (URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL);
    String indexLocation;
    try {
        indexLocation = new File(indexUrl.toURI()).getAbsolutePath();
    } catch (URISyntaxException use) {
        // fall back to the raw file part when the URL is not a valid URI
        indexLocation = new File(indexUrl.getFile()).getAbsolutePath();
    }
    annotationTypesMap = new HashMap<String, List<String>>();
    Set<String> toReturn = new HashSet<String>();
    try {
        IndexReader reader = IndexReader.open(indexLocation);
        try {
            for (String annotSet : readIndexedAnnotationSetIds(reader)) {
                collectCorporaAndFeatures(indexLocation, annotSet, toReturn);
            }
        } finally {
            reader.close();
        }
    } catch (IOException ioe) {
        // the cause is preserved, so no separate stack trace is printed
        throw new SearchException(ioe);
    }
    return toReturn.toArray(new String[0]);
}

/** Reads the texts of all annotation-set-id terms from the given reader. */
private Set<String> readIndexedAnnotationSetIds(IndexReader reader) throws IOException {
    Set<String> annotSets = new HashSet<String>();
    TermEnum terms = reader.terms(new Term(Constants.ANNOTATION_SET_ID, ""));
    if (terms == null) {
        return annotSets;
    }
    try {
        do {
            Term t = terms.term();
            if (t == null) {
                continue;
            }
            if (t.field().equals(Constants.ANNOTATION_SET_ID)) {
                annotSets.add(t.text());
            } else if (!annotSets.isEmpty()) {
                // stop at the first term of another field once at least one
                // annotation set term has been seen
                break;
            }
        } while (terms.next());
    } finally {
        // NOTE(review): assumes the ANNIC fork kept TermEnum.close() from
        // Lucene 1.x; the enumeration was previously never closed
        terms.close();
    }
    return annotSets;
}

/**
 * Adds a "corpusID;annotSet" entry to toReturn for every distinct corpus id
 * among the documents of the annotation set, and records their features.
 */
private void collectCorporaAndFeatures(String indexLocation, String annotSet, Set<String> toReturn) throws IOException {
    TermQuery tq = new TermQuery(new Term(Constants.ANNOTATION_SET_ID, annotSet));
    gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(indexLocation);
    try {
        // The work done per hit depends only on (corpusID, annotSet) and only
        // ever adds entries that are not present yet, so repeating it for a
        // corpus id that was already handled cannot change the outcome.
        Set<String> handledCorpora = new HashSet<String>();
        Hits annotSetHits = searcher.search(tq);
        for (int i = 0; i < annotSetHits.length(); i++) {
            Document luceneDoc = annotSetHits.doc(i);
            String corpusID = luceneDoc.get(Constants.CORPUS_ID);
            if (corpusID == null) {
                corpusID = "";
            }
            if (!handledCorpora.add(corpusID)) {
                continue;
            }
            toReturn.add(corpusID + ";" + annotSet);

            // NOTE(review): both clauses use the same annotation set term, so
            // this matches on the annotation set only. It looks as if one
            // clause was meant to restrict by corpus - confirm before
            // changing; the existing behaviour is kept here.
            TermQuery atq = new TermQuery(new Term(Constants.ANNOTATION_SET_ID, annotSet));
            BooleanQuery bq = new BooleanQuery();
            bq.add(tq, true, false);
            bq.add(atq, true, false);

            // The previous code opened and closed an extra IndexSearcher per
            // hit here but never queried it; the search always ran on
            // searcher.
            Hits indexFeaturesHits = searcher.search(bq);
            for (int j = 0; j < indexFeaturesHits.length(); j++) {
                Document aDoc = indexFeaturesHits.doc(j);
                recordIndexedFeatures(corpusID, annotSet, aDoc.get(Constants.INDEXED_FEATURES));
            }
        }
    } finally {
        searcher.close();
    }
}

/**
 * Splits a ";"-separated list of "AnnotationType.FeatureName" entries and
 * stores each feature name under "corpusID;annotSet;AnnotationType".
 */
private void recordIndexedFeatures(String corpusID, String annotSet, String indexedFeatures) {
    if (indexedFeatures == null) {
        return;
    }
    for (String aFeature : indexedFeatures.split(";")) {
        // AnnotationType.FeatureName
        int index = aFeature.indexOf(".");
        if (index == -1) {
            // entries without a type/feature separator are ignored
            continue;
        }
        String type = aFeature.substring(0, index);
        String featureName = aFeature.substring(index + 1);
        String key = corpusID + ";" + annotSet + ";" + type;
        List<String> listOfFeatures = annotationTypesMap.get(key);
        if (listOfFeatures == null) {
            listOfFeatures = new ArrayList<String>();
            annotationTypesMap.put(key, listOfFeatures);
        }
        if (!listOfFeatures.contains(featureName)) {
            listOfFeatures.add(featureName);
        }
    }
}
Also used : IndexSearcher(gate.creole.annic.apache.lucene.search.IndexSearcher) BooleanQuery(gate.creole.annic.apache.lucene.search.BooleanQuery) Hits(gate.creole.annic.apache.lucene.search.Hits) SearchException(gate.creole.annic.SearchException) URISyntaxException(java.net.URISyntaxException) TermEnum(gate.creole.annic.apache.lucene.index.TermEnum) Document(gate.creole.annic.apache.lucene.document.Document) URL(java.net.URL) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException) IndexReader(gate.creole.annic.apache.lucene.index.IndexReader) File(java.io.File)

Aggregations

Term (gate.creole.annic.apache.lucene.index.Term)3 BooleanQuery (gate.creole.annic.apache.lucene.search.BooleanQuery)3 TermQuery (gate.creole.annic.apache.lucene.search.TermQuery)3 SearchException (gate.creole.annic.SearchException)2 Hits (gate.creole.annic.apache.lucene.search.Hits)2 PhraseQuery (gate.creole.annic.apache.lucene.search.PhraseQuery)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 Document (gate.creole.annic.apache.lucene.document.Document)1 IndexReader (gate.creole.annic.apache.lucene.index.IndexReader)1 TermEnum (gate.creole.annic.apache.lucene.index.TermEnum)1 IndexSearcher (gate.creole.annic.apache.lucene.search.IndexSearcher)1 Query (gate.creole.annic.apache.lucene.search.Query)1 File (java.io.File)1 URISyntaxException (java.net.URISyntaxException)1 URL (java.net.URL)1 HashSet (java.util.HashSet)1