Search in sources :

Example 1 with TermQuery

use of gate.creole.annic.apache.lucene.search.TermQuery in project gate-core by GateNLP.

the class LuceneIndexer method getNamesOfSerializedFiles.

/**
 * Collects, for one indexed document, the identifiers under which its
 * serialized files are recorded in the index.
 * <p>
 * The index found at {@code Constants.INDEX_LOCATION_URL} (read from this
 * indexer's parameters) is searched for every Lucene document whose
 * {@code Constants.DOCUMENT_ID} field equals {@code documentID}. The value of
 * each hit's {@code Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE} field is added
 * to the result.
 *
 * @param documentID value to match against the {@code DOCUMENT_ID} field
 * @return the collected field values; empty when there are no hits
 * @throws IndexException wrapping any {@link IOException} raised while
 *           opening, searching or closing the index
 */
public Set<String> getNamesOfSerializedFiles(String documentID) throws IndexException {
    // Prefer the URI form of the index URL; fall back to its raw file part
    // when the URL cannot be converted to a URI.
    final URL indexUrl = (URL) parameters.get(Constants.INDEX_LOCATION_URL);
    String indexPath;
    try {
        indexPath = new File(indexUrl.toURI()).getAbsolutePath();
    } catch (URISyntaxException badUri) {
        indexPath = new File(indexUrl.getFile()).getAbsolutePath();
    }

    final Set<String> serializedFileIds = new HashSet<String>();
    try {
        final TermQuery byDocumentId = new TermQuery(new Term(Constants.DOCUMENT_ID, documentID));
        final gate.creole.annic.apache.lucene.search.Searcher indexSearcher = new IndexSearcher(indexPath);
        try {
            // run the query and walk over every hit it produced
            final Hits matches = indexSearcher.search(byDocumentId);
            int hit = 0;
            while (hit < matches.length()) {
                Document matched = matches.doc(hit);
                serializedFileIds.add(matched.get(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE));
                hit++;
            }
        } finally {
            // always release the searcher, even when the search fails
            indexSearcher.close();
        }
    } catch (IOException ioe) {
        throw new IndexException(ioe);
    }
    return serializedFileIds;
}
Also used : IndexSearcher(gate.creole.annic.apache.lucene.search.IndexSearcher) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) Hits(gate.creole.annic.apache.lucene.search.Hits) IndexException(gate.creole.annic.IndexException) URISyntaxException(java.net.URISyntaxException) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException) Document(gate.creole.annic.apache.lucene.document.Document) URL(java.net.URL) File(java.io.File) HashSet(java.util.HashSet)

Example 2 with TermQuery

use of gate.creole.annic.apache.lucene.search.TermQuery in project gate-core by GateNLP.

the class QueryParser method parse.

/**
 * Parses a query string into one Lucene query per sub-query returned by
 * {@code SubQueryParser.parseQuery}.
 * <p>
 * According to the notes kept with this method, the query language supports
 * the {@code |} operator, the Kleene operators {@code *} and {@code +}, an
 * implicit {@code &} between tokens, and these token forms:
 * <ul>
 * <li>{@code String} (without quotes)</li>
 * <li>{@code "String"} (with quotes)</li>
 * <li>{@code {AnnotationType}}</li>
 * <li>{@code {AnnotationType==String}}</li>
 * <li>{@code {AnnotationType=="String"}}</li>
 * <li>{@code {AnnotationType.feature==string}}</li>
 * <li>{@code {AnnotationType.feature=="string"}}</li>
 * </ul>
 * Each sub-query is turned into a phrase (or term) query by
 * {@code createPhraseQuery} and wrapped in a {@code BooleanQuery} together
 * with clauses derived from {@code corpusID} and
 * {@code annotationSetToSearchIn}.
 *
 * @param field stored in this parser's {@code field} member
 * @param query the query string to parse
 * @param baseTokenAnnotationType stored in this parser's
 *          {@code baseTokenAnnotationType} member
 * @param corpusID when non-null, a clause on {@code Constants.CORPUS_ID} is
 *          added with flags {@code (true, false)}
 * @param annotationSetToSearchIn when non-null, a clause on
 *          {@code Constants.ANNOTATION_SET_ID} for this value is added with
 *          flags {@code (true, false)}; when null, a clause for
 *          {@code Constants.COMBINED_SET} is added with flags
 *          {@code (false, true)}
 * @return one query per parsed sub-query, in parse order
 * @throws gate.creole.ir.SearchException
 */
public Query[] parse(String field, String query, String baseTokenAnnotationType, String corpusID, String annotationSetToSearchIn) throws gate.creole.ir.SearchException {
    this.field = field;
    this.baseTokenAnnotationType = baseTokenAnnotationType;
    this.position = 0;

    // The query is read left to right; every entry of the parsed list is
    // converted into its own phrase query below.
    queries = SubQueryParser.parseQuery(query);
    Query[] result = new Query[queries.size()];

    // NOTE(review): the two boolean flags of BooleanQuery.add look like the
    // classic Lucene (required, prohibited) pair - confirm for this fork.
    final boolean noRestriction = corpusID == null && annotationSetToSearchIn == null;
    for (int i = 0; i < queries.size(); i++) {
        Query phrase = createPhraseQuery(queries.get(i));
        BooleanQuery combined = new BooleanQuery();
        if (noRestriction) {
            // combined-set clause first, then the phrase itself
            combined.add(new TermQuery(new Term(Constants.ANNOTATION_SET_ID, Constants.COMBINED_SET)), false, true);
            combined.add(phrase, true, false);
        } else {
            // phrase first, then whichever restrictions were supplied
            combined.add(phrase, true, false);
            if (corpusID != null) {
                combined.add(new TermQuery(new Term(Constants.CORPUS_ID, corpusID)), true, false);
            }
            if (annotationSetToSearchIn == null) {
                combined.add(new TermQuery(new Term(Constants.ANNOTATION_SET_ID, Constants.COMBINED_SET)), false, true);
            } else {
                combined.add(new TermQuery(new Term(Constants.ANNOTATION_SET_ID, annotationSetToSearchIn)), true, false);
            }
        }
        result[i] = combined;
    }
    return result;
}
Also used : BooleanQuery(gate.creole.annic.apache.lucene.search.BooleanQuery) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) BooleanQuery(gate.creole.annic.apache.lucene.search.BooleanQuery) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) PhraseQuery(gate.creole.annic.apache.lucene.search.PhraseQuery) Query(gate.creole.annic.apache.lucene.search.Query) Term(gate.creole.annic.apache.lucene.index.Term)

Example 3 with TermQuery

use of gate.creole.annic.apache.lucene.search.TermQuery in project gate-core by GateNLP.

the class QueryParser method createPhraseQuery.

/**
 * Turns one normalized query into a Lucene query.
 * <p>
 * If the query consists of a single token and that token yields exactly one
 * term, a {@code TermQuery} for that term is returned. In every other case a
 * {@code PhraseQuery} is built from the terms of the leading tokens (see the
 * loop below for where it stops).
 * <p>
 * Side effects visible in this method: {@code position} is reset to 0 and
 * {@code needValidation} is assigned on every path.
 *
 * @param query a normalized query, e.g.
 *          {@code {Person}"said" "Hello" {Person.gender=="male"}}
 * @return a {@code TermQuery} or a {@code PhraseQuery} as described above
 * @throws gate.creole.ir.SearchException declared by this method; no throw
 *           is visible in its body, so presumably it comes from
 *           {@code findTokens} / {@code createTerms} - TODO confirm
 */
private Query createPhraseQuery(String query) throws gate.creole.ir.SearchException {
    // Here we play the actual trick with lucene.
    // For a query like {Lookup}{Token}{Person.gender=="male"}
    // internally this query is converted into the following PhraseQuery
    // (Lookup Token Person male).
    // These are the four terms which will be searched and they should
    // occur in this order only, but what we need is a pattern where
    //   Lookup      -> the first annotation is of type Lookup
    //   Token       -> the second annotation type is Token
    //   Person male -> the third annotation must have a type Person and a
    //                  feature gender with value male
    // That means Person and male should be considered at the same
    // location.
    // By default lucene doesn't do this and looks for a position that is
    // 1 step more than the previous one, so it will search for the first
    // position of Lookup; let's say it is 19 (i.e. 19th annotation in the
    // document), then it would consider the 20th location for Token,
    // the 21st for Person and the 22nd for male.
    // But we need 19th for Lookup, 20th for Token and 21st for both
    // Person and male, so from here itself we send our choice for the
    // location of annotations via the positions passed to the PhraseQuery.
    position = 0;
    PhraseQuery phQuery = new PhraseQuery();
    // We tokenize the query to split it into separate tokens.
    // A query like {Person}"said" "Hello" {Person.gender=="male"}
    // is split into
    //   {Person}
    //   "said"
    //   "Hello"
    //   {Person.gender=="male"}
    List<String> tokens = findTokens(query);
    // Single-token shortcut: if the only token produces exactly one term,
    // answer with a plain TermQuery instead of a PhraseQuery.
    if (tokens.size() == 1) {
        // createTerms returns an array of lists; index 0 holds the terms
        List<?>[] termsPos = createTerms(tokens.get(0));
        @SuppressWarnings("unchecked") List<Term> terms = (List<Term>) termsPos[0];
        if (terms.size() == 1) {
            // areAllTermsTokens is a field that is not assigned in this
            // method - presumably set by createTerms; TODO confirm.
            // Validation is requested only when it is false.
            if (areAllTermsTokens)
                needValidation = false;
            else
                needValidation = true;
            return new TermQuery(terms.get(0));
        } else {
            // The same token is passed to createTerms again in the loop
            // below, so start from position 0 again (presumably createTerms
            // advances position - TODO confirm).
            position = 0;
        }
    }
    // number of terms added to the phrase with their "consider" flag set
    int totalTerms = 0;
    // whether every term of the previously added token was a base token term
    boolean hadPreviousTermsAToken = true;
    needValidation = false;
    // and now for each token we need to create Term(s)
    outer: for (int i = 0; i < tokens.size(); i++) {
        // index 0: terms, index 1: their positions, index 2: "consider" flags
        List<?>[] termpositions = createTerms(tokens.get(i));
        @SuppressWarnings("unchecked") List<Term> terms = (List<Term>) termpositions[0];
        @SuppressWarnings("unchecked") List<Integer> pos = (List<Integer>) termpositions[1];
        @SuppressWarnings("unchecked") List<Boolean> consider = (List<Boolean>) termpositions[2];
        boolean allTermsTokens = true;
        // allTermsTokens stays true only if isBaseTokenTerm accepts every
        // term of this token; once it turns false it is not re-evaluated
        for (int k = 0; k < terms.size(); k++) {
            Term t = terms.get(k);
            if (allTermsTokens)
                allTermsTokens = isBaseTokenTerm(t);
        }
        // The token added in the previous iteration contained a non-base-token
        // term: stop extending the phrase and request validation.
        if (!hadPreviousTermsAToken) {
            needValidation = true;
            break;
        }
        if (!allTermsTokens) {
            // This token contains a non-base-token term: request validation.
            // Unless it is the very first token, stop here WITHOUT adding its
            // terms to the phrase.
            needValidation = true;
            if (i > 0)
                break outer;
        }
        // add every term of this token at the position chosen by createTerms
        for (int k = 0; k < terms.size(); k++) {
            Term t = terms.get(k);
            boolean considerValue = consider.get(k).booleanValue();
            phQuery.add(t, pos.get(k), considerValue);
            // only terms flagged as "consider" count towards the total
            if (considerValue)
                totalTerms++;
        }
        hadPreviousTermsAToken = allTermsTokens;
    }
    phQuery.setTotalTerms(totalTerms);
    return phQuery;
}
Also used : TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) PhraseQuery(gate.creole.annic.apache.lucene.search.PhraseQuery) Term(gate.creole.annic.apache.lucene.index.Term) List(java.util.List) ArrayList(java.util.ArrayList)

Example 4 with TermQuery

use of gate.creole.annic.apache.lucene.search.TermQuery in project gate-core by GateNLP.

the class StatsCalculator method freq.

/**
 * Retrieves the frequency of an annotation type, one of its features, or a
 * specific feature value, optionally restricted to a corpus and/or an
 * annotation set.
 * <p>
 * The searcher is NOT closed by this method; the caller must close it.
 * Failing to do so may leave many files open at the same time, which can
 * cause problems with the operating system. Regardless of the outcome,
 * {@code searcher.initializeTermPositions()} is called before returning.
 *
 * @param searcher the searcher to run the query on; owned by the caller
 * @param corpusToSearchIn corpus restriction; null or blank means none
 * @param annotationSetToSearchIn annotation set restriction; null or blank
 *          means none
 * @param annotationType annotation type to count; must not be null
 * @param featureName feature of the annotation type, or null; must not be
 *          null when {@code value} is given
 * @param value feature value to count, or null
 * @return the sum, over all hits, of the frequency recorded for each hit in
 *         the searcher's first-term positions; 0 when none were recorded
 * @throws SearchException if {@code annotationType} is null, if
 *           {@code value} is given without {@code featureName}, or wrapping
 *           an {@link IOException} from the search
 */
public static int freq(IndexSearcher searcher, String corpusToSearchIn, String annotationSetToSearchIn, String annotationType, String featureName, String value) throws SearchException {
    try {
        // A blank restriction is treated exactly like an absent one.
        String corpus = null;
        if (corpusToSearchIn != null) {
            String trimmedCorpus = corpusToSearchIn.trim();
            if (trimmedCorpus.length() != 0) {
                corpus = trimmedCorpus;
            }
        }
        String annotationSet = null;
        if (annotationSetToSearchIn != null) {
            String trimmedSet = annotationSetToSearchIn.trim();
            if (trimmedSet.length() != 0) {
                annotationSet = trimmedSet;
            }
        }
        if (annotationType == null) {
            throw new SearchException("Annotation Type cannot be null");
        }

        // Pick the index term that represents what is being counted.
        final Term searchTerm;
        if (value == null) {
            if (featureName == null) {
                // annotation type only
                searchTerm = new Term("contents", annotationType, "*");
            } else {
                // annotation type + feature, any value
                searchTerm = new Term("contents", annotationType + "." + featureName, "**");
            }
        } else {
            if (featureName == null) {
                throw new SearchException("FeatureName cannot be null");
            }
            // a concrete feature value
            searchTerm = new Term("contents", value, annotationType + "." + featureName);
        }
        final TermQuery termQuery = new TermQuery(searchTerm);

        // Restrictions, if any, are combined with the term query in a
        // boolean query; otherwise the term query is run on its own.
        final BooleanQuery restrictedQuery = new BooleanQuery();
        boolean restricted = false;
        if (corpus != null) {
            PhraseQuery corpusClause = new PhraseQuery();
            corpusClause.add(new Term(Constants.CORPUS_ID, corpus), 0, true);
            restrictedQuery.add(corpusClause, true, false);
            restricted = true;
        }
        if (annotationSet != null) {
            PhraseQuery setClause = new PhraseQuery();
            setClause.add(new Term(Constants.ANNOTATION_SET_ID, annotationSet), 0, true);
            restrictedQuery.add(setClause, true, false);
            restricted = true;
        }
        final Hits hits;
        if (restricted) {
            restrictedQuery.add(termQuery, true, false);
            hits = searcher.search(restrictedQuery);
        } else {
            hits = searcher.search(termQuery);
        }

        final List<?>[] termPositions = searcher.getFirstTermPositions();
        // nothing recorded by the search: the frequency is zero
        if (termPositions[0].isEmpty()) {
            return 0;
        }

        // List 0 is looked up by hit id; list 4 holds, at the same index,
        // the frequency for that entry.
        int total = 0;
        for (int h = 0; h < hits.length(); h++) {
            int slot = termPositions[0].indexOf(Integer.valueOf(hits.id(h)));
            total += ((Integer) termPositions[4].get(slot)).intValue();
        }
        return total;
    } catch (IOException ioe) {
        throw new SearchException(ioe);
    } finally {
        searcher.initializeTermPositions();
    }
}
Also used : TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) BooleanQuery(gate.creole.annic.apache.lucene.search.BooleanQuery) Hits(gate.creole.annic.apache.lucene.search.Hits) PhraseQuery(gate.creole.annic.apache.lucene.search.PhraseQuery) SearchException(gate.creole.annic.SearchException) ArrayList(java.util.ArrayList) List(java.util.List) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException)

Example 5 with TermQuery

use of gate.creole.annic.apache.lucene.search.TermQuery in project gate-core by GateNLP.

the class LuceneSearcher method search.

/**
 * Runs a search and reports whether any results were found.
 * <p>
 * Two modes are handled:
 * <ul>
 * <li><b>Document listing.</b> When {@code parameters} holds exactly two
 * entries, {@code Constants.INDEX_LOCATION_URL} is set and
 * {@code Constants.CORPUS_ID} is non-null, a term query on the corpus ID is
 * run against that index. The hits are stored in {@code luceneHits} and
 * {@code wasDeleteQuery} is set to true.</li>
 * <li><b>Pattern search.</b> Otherwise {@code query} is run against every
 * entry of {@code Constants.INDEX_LOCATIONS} (defaulting to the datastore
 * indexer's index location, which is also written back into
 * {@code parameters}). Each location whose search succeeds contributes a
 * {@code LuceneSearchThread} to {@code luceneSearchThreads}.</li>
 * </ul>
 * All per-search state of this searcher is reset at the start of the call.
 *
 * @param query the query to run; must not be null for a pattern search
 * @param parameters search parameters keyed by {@code Constants} names; must
 *          not be null. A pattern search requires a positive
 *          {@code Constants.CONTEXT_WINDOW}
 * @return true if at least one result was found, false otherwise
 * @throws SearchException if a required parameter is missing or invalid, or
 *           wrapping an {@link IOException} from the document-listing search
 */
@SuppressWarnings("unchecked")
@Override
public boolean search(String query, Map<String, Object> parameters) throws SearchException {
    // Reset all per-search state up front so that a failed call never leaves
    // results of a previous search behind.
    luceneHits = null;
    annicPatterns = new ArrayList<Pattern>();
    annotationTypesMap = new HashMap<String, List<String>>();
    luceneSearchThreads = new ArrayList<LuceneSearchThread>();
    luceneSearchThreadIndex = 0;
    success = false;
    fwdIterationEnded = false;
    wasDeleteQuery = false;
    if (parameters == null) {
        throw new SearchException("Parameters cannot be null");
    }
    this.parameters = parameters;

    /*
     * First check whether this is a request for the documents stored under a
     * specific corpus (exactly two parameters, one of them the index URL).
     */
    if (parameters.size() == 2 && parameters.get(Constants.INDEX_LOCATION_URL) != null) {
        String corpusID = (String) parameters.get(Constants.CORPUS_ID);
        // Prefer the URI form of the URL; fall back to its raw file part when
        // the URL cannot be converted to a URI.
        String indexLocation;
        try {
            indexLocation = new File(((URL) parameters.get(Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
        } catch (URISyntaxException use) {
            indexLocation = new File(((URL) parameters.get(Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
        }
        if (corpusID != null) {
            wasDeleteQuery = true;
            TermQuery tq = new TermQuery(new Term(Constants.CORPUS_ID, corpusID));
            try {
                // NOTE(review): this searcher is not closed here; the hits it
                // produced are kept in luceneHits after returning - confirm
                // who is responsible for releasing it.
                gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(indexLocation);
                luceneHits = searcher.search(tq);
                success = luceneHits.length() > 0;
                return success;
            } catch (IOException ioe) {
                // the cause travels with the SearchException; no separate
                // stack trace dump is needed
                throw new SearchException(ioe);
            }
        }
    }

    // Default the index locations to the datastore indexer's own location.
    if (parameters.get(Constants.INDEX_LOCATIONS) == null) {
        String indexLocation;
        try {
            indexLocation = new File(((URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
        } catch (URISyntaxException use) {
            indexLocation = new File(((URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
        }
        ArrayList<String> defaultLocations = new ArrayList<String>();
        defaultLocations.add(indexLocation);
        parameters.put(Constants.INDEX_LOCATIONS, defaultLocations);
    }
    indexLocations = new ArrayList<String>((List<? extends String>) parameters.get(Constants.INDEX_LOCATIONS));
    if (indexLocations.size() == 0) {
        throw new SearchException("Corpus is not initialized");
    }

    // check for valid context window
    if (parameters.get(Constants.CONTEXT_WINDOW) == null) {
        throw new SearchException("Parameter " + Constants.CONTEXT_WINDOW + " is not provided!");
    }
    contextWindow = ((Integer) parameters.get(Constants.CONTEXT_WINDOW)).intValue();
    if (getContextWindow().intValue() <= 0) {
        throw new SearchException("Context Window must be atleast 1 or > 1");
    }
    if (query == null) {
        throw new SearchException("Query is not initialized");
    }
    this.query = query;
    this.corpusToSearchIn = (String) parameters.get(Constants.CORPUS_ID);
    this.annotationSetToSearchIn = (String) parameters.get(Constants.ANNOTATION_SET_ID);

    // TODO: is this really useful or used to have several indexLocations ?
    for (int indexCounter = 0; indexCounter < indexLocations.size(); indexCounter++) {
        String location = indexLocations.get(indexCounter);
        // one LuceneSearchThread per index location; only those that report
        // a successful search are kept
        LuceneSearchThread lst = new LuceneSearchThread();
        if (lst.search(query, contextWindow, location, corpusToSearchIn, annotationSetToSearchIn, this)) {
            luceneSearchThreads.add(lst);
        }
    }
    success = luceneSearchThreads.size() > 0;
    return success;
}
Also used : IndexSearcher(gate.creole.annic.apache.lucene.search.IndexSearcher) Pattern(gate.creole.annic.Pattern) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) ArrayList(java.util.ArrayList) SearchException(gate.creole.annic.SearchException) URISyntaxException(java.net.URISyntaxException) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException) URL(java.net.URL) ArrayList(java.util.ArrayList) List(java.util.List) File(java.io.File)

Aggregations

Term (gate.creole.annic.apache.lucene.index.Term)6 TermQuery (gate.creole.annic.apache.lucene.search.TermQuery)6 IOException (java.io.IOException)4 ArrayList (java.util.ArrayList)4 List (java.util.List)4 SearchException (gate.creole.annic.SearchException)3 BooleanQuery (gate.creole.annic.apache.lucene.search.BooleanQuery)3 Hits (gate.creole.annic.apache.lucene.search.Hits)3 IndexSearcher (gate.creole.annic.apache.lucene.search.IndexSearcher)3 PhraseQuery (gate.creole.annic.apache.lucene.search.PhraseQuery)3 File (java.io.File)3 URISyntaxException (java.net.URISyntaxException)3 URL (java.net.URL)3 Document (gate.creole.annic.apache.lucene.document.Document)2 HashSet (java.util.HashSet)2 IndexException (gate.creole.annic.IndexException)1 Pattern (gate.creole.annic.Pattern)1 IndexReader (gate.creole.annic.apache.lucene.index.IndexReader)1 TermEnum (gate.creole.annic.apache.lucene.index.TermEnum)1 Query (gate.creole.annic.apache.lucene.search.Query)1