Search in sources :

Example 1 with IndexSearcher

use of gate.creole.annic.apache.lucene.search.IndexSearcher in project gate-core by GateNLP.

the class LuceneIndexer method getNamesOfSerializedFiles.

/**
 * Searches the index for entries whose {@code Constants.DOCUMENT_ID} field
 * equals the given document ID and collects, from every hit, the value stored
 * under {@code Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE}.
 *
 * @param documentID the value to match against the document ID field
 * @return the collected values; empty when the query produces no hits
 * @throws IndexException if opening, searching or closing the index fails
 *           with an {@link IOException}
 */
public Set<String> getNamesOfSerializedFiles(String documentID) throws IndexException {
    URL indexUrl = (URL) parameters.get(Constants.INDEX_LOCATION_URL);
    String indexPath;
    try {
        indexPath = new File(indexUrl.toURI()).getAbsolutePath();
    } catch (URISyntaxException use) {
        // the URL cannot be expressed as a URI: fall back to its raw file part
        indexPath = new File(indexUrl.getFile()).getAbsolutePath();
    }

    Set<String> serializedFileIds = new HashSet<String>();
    try {
        TermQuery byDocumentId = new TermQuery(new Term(Constants.DOCUMENT_ID, documentID));
        gate.creole.annic.apache.lucene.search.Searcher indexSearcher = new IndexSearcher(indexPath);
        try {
            // run the query and walk over every matching index document
            Hits matches = indexSearcher.search(byDocumentId);
            int position = 0;
            while (position < matches.length()) {
                Document match = matches.doc(position);
                serializedFileIds.add(match.get(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE));
                position++;
            }
        } finally {
            indexSearcher.close();
        }
    } catch (IOException ioe) {
        throw new IndexException(ioe);
    }
    return serializedFileIds;
}
Also used : IndexSearcher(gate.creole.annic.apache.lucene.search.IndexSearcher) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) Hits(gate.creole.annic.apache.lucene.search.Hits) IndexException(gate.creole.annic.IndexException) URISyntaxException(java.net.URISyntaxException) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException) Document(gate.creole.annic.apache.lucene.document.Document) URL(java.net.URL) File(java.io.File) HashSet(java.util.HashSet)

Example 2 with IndexSearcher

use of gate.creole.annic.apache.lucene.search.IndexSearcher in project gate-core by GateNLP.

the class LuceneSearcher method freq.

/**
 * Opens an {@code IndexSearcher} on the datastore's index location and
 * delegates the computation to {@code StatsCalculator.freq}.
 *
 * @param corpusToSearchIn passed through to {@code StatsCalculator.freq}
 * @param annotationSetToSearchIn passed through to {@code StatsCalculator.freq}
 * @param annotationType passed through to {@code StatsCalculator.freq}
 * @param featureName passed through to {@code StatsCalculator.freq}
 * @param value passed through to {@code StatsCalculator.freq}
 * @return the value computed by {@code StatsCalculator.freq}, or -1 when the
 *         index searcher could not be opened or closed
 * @throws SearchException declared by the signature; presumably raised by
 *           {@code StatsCalculator.freq} (confirm against that class)
 */
@Override
public int freq(String corpusToSearchIn, String annotationSetToSearchIn, String annotationType, String featureName, String value) throws SearchException {
    URL indexUrl = (URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL);
    String indexLocation;
    try {
        indexLocation = new File(indexUrl.toURI()).getAbsolutePath();
    } catch (URISyntaxException use) {
        // the URL cannot be expressed as a URI: fall back to its raw file part
        indexLocation = new File(indexUrl.getFile()).getAbsolutePath();
    }
    IndexSearcher indexSearcher;
    try {
        // open the IndexSearcher
        indexSearcher = new IndexSearcher(indexLocation);
    } catch (IOException e) {
        e.printStackTrace();
        return -1;
    }
    int result = -1;
    try {
        result = StatsCalculator.freq(indexSearcher, corpusToSearchIn, annotationSetToSearchIn, annotationType, featureName, value);
    } finally {
        // always release the searcher, even when the computation above throws;
        // no return statement here so a pending exception is never swallowed
        try {
            indexSearcher.close();
        } catch (IOException ioe) {
            ioe.printStackTrace();
            // a failed close invalidates the result, as before
            result = -1;
        }
    }
    return result;
}
Also used : IndexSearcher(gate.creole.annic.apache.lucene.search.IndexSearcher) URISyntaxException(java.net.URISyntaxException) IOException(java.io.IOException) File(java.io.File) URL(java.net.URL)

Example 3 with IndexSearcher

use of gate.creole.annic.apache.lucene.search.IndexSearcher in project gate-core by GateNLP.

the class LuceneSearcher method search.

/**
 * Executes a search and reports whether anything was found.
 * <p>
 * When the parameter map has exactly two entries, one of them a non-null
 * {@code Constants.INDEX_LOCATION_URL}, and a {@code Constants.CORPUS_ID} is
 * present, only a term query on the corpus ID is run and its hits are kept
 * in {@code luceneHits}. Otherwise the query is handed to one
 * {@code LuceneSearchThread} per index location.
 * </p>
 *
 * @param query the query to run; must not be null for a regular search
 * @param parameters search settings; must not be null
 * @return true if results were found, false otherwise
 * @throws SearchException if a required parameter is missing or invalid, or
 *           if the index cannot be read
 */
@SuppressWarnings("unchecked")
@Override
public boolean search(String query, Map<String, Object> parameters) throws SearchException {
    // start from a clean slate
    luceneHits = null;
    annicPatterns = new ArrayList<Pattern>();
    annotationTypesMap = new HashMap<String, List<String>>();
    luceneSearchThreads = new ArrayList<LuceneSearchThread>();
    luceneSearchThreadIndex = 0;
    success = false;
    fwdIterationEnded = false;
    wasDeleteQuery = false;
    if (parameters == null) {
        throw new SearchException("Parameters cannot be null");
    }
    this.parameters = parameters;

    /*
     * First check whether this is a request to search the document names,
     * which is used when only the documents stored under a specific corpus
     * are wanted.
     */
    if (parameters.size() == 2 && parameters.get(Constants.INDEX_LOCATION_URL) != null) {
        String corpusID = (String) parameters.get(Constants.CORPUS_ID);
        URL locationUrl = (URL) parameters.get(Constants.INDEX_LOCATION_URL);
        String corpusIndexLocation = null;
        try {
            corpusIndexLocation = new File(locationUrl.toURI()).getAbsolutePath();
        } catch (URISyntaxException use) {
            corpusIndexLocation = new File(locationUrl.getFile()).getAbsolutePath();
        }
        if (corpusID != null && corpusIndexLocation != null) {
            wasDeleteQuery = true;
            TermQuery corpusQuery = new TermQuery(new Term(Constants.CORPUS_ID, corpusID));
            try {
                gate.creole.annic.apache.lucene.search.Searcher corpusSearcher = new IndexSearcher(corpusIndexLocation);
                // run the query; the hits are kept in the luceneHits field
                luceneHits = corpusSearcher.search(corpusQuery);
                success = luceneHits.length() > 0;
                return success;
            } catch (IOException ioe) {
                ioe.printStackTrace();
                throw new SearchException(ioe);
            }
        }
    }

    // no index locations supplied: default to the datastore's own index
    if (parameters.get(Constants.INDEX_LOCATIONS) == null) {
        URL datastoreUrl = (URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL);
        String datastoreLocation;
        try {
            datastoreLocation = new File(datastoreUrl.toURI()).getAbsolutePath();
        } catch (URISyntaxException use) {
            datastoreLocation = new File(datastoreUrl.getFile()).getAbsolutePath();
        }
        ArrayList<String> defaultLocations = new ArrayList<String>();
        defaultLocations.add(datastoreLocation);
        parameters.put(Constants.INDEX_LOCATIONS, defaultLocations);
    }
    indexLocations = new ArrayList<String>((List<? extends String>) parameters.get(Constants.INDEX_LOCATIONS));
    if (indexLocations.isEmpty()) {
        throw new SearchException("Corpus is not initialized");
    }

    // the context window is mandatory and must be positive
    if (parameters.get(Constants.CONTEXT_WINDOW) == null) {
        throw new SearchException("Parameter " + Constants.CONTEXT_WINDOW + " is not provided!");
    }
    contextWindow = ((Integer) parameters.get(Constants.CONTEXT_WINDOW)).intValue();
    if (getContextWindow().intValue() <= 0) {
        throw new SearchException("Context Window must be atleast 1 or > 1");
    }

    if (query == null) {
        throw new SearchException("Query is not initialized");
    }
    this.query = query;
    this.corpusToSearchIn = (String) parameters.get(Constants.CORPUS_ID);
    this.annotationSetToSearchIn = (String) parameters.get(Constants.ANNOTATION_SET_ID);
    annicPatterns = new ArrayList<Pattern>();
    annotationTypesMap = new HashMap<String, List<String>>();
    luceneSearchThreads = new ArrayList<LuceneSearchThread>();

    // TODO: is this really useful or used to have several indexLocations ?
    for (int locationIndex = 0; locationIndex < indexLocations.size(); locationIndex++) {
        // one LuceneSearchThread per index; keep only those that found something
        LuceneSearchThread searchThread = new LuceneSearchThread();
        boolean found = searchThread.search(query, contextWindow, indexLocations.get(locationIndex), corpusToSearchIn, annotationSetToSearchIn, this);
        if (found) {
            luceneSearchThreads.add(searchThread);
        }
    }
    success = luceneSearchThreads.size() > 0;
    return success;
}
Also used : IndexSearcher(gate.creole.annic.apache.lucene.search.IndexSearcher) Pattern(gate.creole.annic.Pattern) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) ArrayList(java.util.ArrayList) SearchException(gate.creole.annic.SearchException) URISyntaxException(java.net.URISyntaxException) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException) URL(java.net.URL) ArrayList(java.util.ArrayList) List(java.util.List) File(java.io.File)

Example 4 with IndexSearcher

use of gate.creole.annic.apache.lucene.search.IndexSearcher in project gate-core by GateNLP.

the class LuceneSearcher method getIndexedAnnotationSetNames.

/**
 * This method returns a set of annotation set names that are indexed. Each
 * entry has the following format:
 * <p>
 * corpusID;annotationSetName
 * </p>
 * where corpusID is the value of the {@code Constants.CORPUS_ID} field of an
 * index document carrying that annotation set (empty when the field is
 * absent).
 * <p>
 * As a side effect {@code annotationTypesMap} is rebuilt: for every
 * "corpusID;annotationSetName;annotationType" key it lists the feature names
 * found in the {@code Constants.INDEXED_FEATURES} field.
 * </p>
 *
 * @return the distinct "corpusID;annotationSetName" entries, in no
 *         particular order; an empty array if the index yields no term
 *         enumeration
 * @throws SearchException if the index cannot be read
 */
@Override
public String[] getIndexedAnnotationSetNames() throws SearchException {
    URL indexUrl = (URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL);
    String indexLocation;
    try {
        indexLocation = new File(indexUrl.toURI()).getAbsolutePath();
    } catch (URISyntaxException use) {
        // the URL cannot be expressed as a URI: fall back to its raw file part
        indexLocation = new File(indexUrl.getFile()).getAbsolutePath();
    }
    annotationTypesMap = new HashMap<String, List<String>>();
    Set<String> toReturn = new HashSet<String>();
    try {
        IndexReader reader = IndexReader.open(indexLocation);
        try {
            // position the enumeration on the first annotation set term
            TermEnum terms = reader.terms(new Term(Constants.ANNOTATION_SET_ID, ""));
            if (terms == null) {
                return new String[0];
            }
            // iterating over terms and finding out names of annotation sets indexed
            Set<String> annotSets = new HashSet<String>();
            try {
                boolean foundAnnotSet = false;
                do {
                    Term t = terms.term();
                    if (t == null) {
                        continue;
                    }
                    if (t.field().equals(Constants.ANNOTATION_SET_ID)) {
                        annotSets.add(t.text());
                        foundAnnotSet = true;
                    } else if (foundAnnotSet) {
                        // past the run of annotation set terms: nothing more to collect
                        break;
                    }
                } while (terms.next());
            } finally {
                // the enumeration holds index resources and was never released before
                terms.close();
            }

            for (String annotSet : annotSets) {
                TermQuery tq = new TermQuery(new Term(Constants.ANNOTATION_SET_ID, annotSet));
                // NOTE(review): both required clauses match the same term, so this
                // query selects the same documents as tq alone - confirm intent.
                BooleanQuery bq = new BooleanQuery();
                bq.add(tq, true, false);
                bq.add(new TermQuery(new Term(Constants.ANNOTATION_SET_ID, annotSet)), true, false);

                // A single searcher serves both queries. Previously a second
                // searcher was opened and closed for every hit without ever being
                // queried.
                gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(indexLocation);
                try {
                    Hits annotSetHits = searcher.search(tq);
                    for (int i = 0; i < annotSetHits.length(); i++) {
                        Document luceneDoc = annotSetHits.doc(i);
                        // but not all documents belong to corpora
                        String corpusID = luceneDoc.get(Constants.CORPUS_ID);
                        if (corpusID == null) {
                            corpusID = "";
                        }
                        toReturn.add(corpusID + ";" + annotSet);

                        Hits indexFeaturesHits = searcher.search(bq);
                        for (int j = 0; j < indexFeaturesHits.length(); j++) {
                            Document aDoc = indexFeaturesHits.doc(j);
                            registerIndexedFeatures(corpusID, annotSet, aDoc.get(Constants.INDEXED_FEATURES));
                        }
                    }
                } finally {
                    searcher.close();
                }
            }
        } finally {
            reader.close();
        }
    } catch (IOException ioe) {
        throw new SearchException(ioe);
    }
    return toReturn.toArray(new String[0]);
}

/**
 * Adds the feature names of a ';'-separated list of
 * "AnnotationType.FeatureName" entries to {@code annotationTypesMap} under
 * the key "corpusID;annotSet;AnnotationType". Entries without a '.' are
 * skipped and a feature name is stored at most once per key.
 */
private void registerIndexedFeatures(String corpusID, String annotSet, String indexedFeatures) {
    if (indexedFeatures == null) {
        return;
    }
    for (String aFeature : indexedFeatures.split(";")) {
        // AnnotationType.FeatureName
        int index = aFeature.indexOf(".");
        if (index == -1) {
            continue;
        }
        String type = aFeature.substring(0, index);
        String featureName = aFeature.substring(index + 1);
        String key = corpusID + ";" + annotSet + ";" + type;
        List<String> listOfFeatures = annotationTypesMap.get(key);
        if (listOfFeatures == null) {
            listOfFeatures = new ArrayList<String>();
            annotationTypesMap.put(key, listOfFeatures);
        }
        if (!listOfFeatures.contains(featureName)) {
            listOfFeatures.add(featureName);
        }
    }
}
Also used : IndexSearcher(gate.creole.annic.apache.lucene.search.IndexSearcher) BooleanQuery(gate.creole.annic.apache.lucene.search.BooleanQuery) Hits(gate.creole.annic.apache.lucene.search.Hits) SearchException(gate.creole.annic.SearchException) URISyntaxException(java.net.URISyntaxException) TermEnum(gate.creole.annic.apache.lucene.index.TermEnum) Document(gate.creole.annic.apache.lucene.document.Document) URL(java.net.URL) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException) IndexReader(gate.creole.annic.apache.lucene.index.IndexReader) File(java.io.File)

Aggregations

IndexSearcher (gate.creole.annic.apache.lucene.search.IndexSearcher)4 File (java.io.File)4 IOException (java.io.IOException)4 URISyntaxException (java.net.URISyntaxException)4 URL (java.net.URL)4 Term (gate.creole.annic.apache.lucene.index.Term)3 TermQuery (gate.creole.annic.apache.lucene.search.TermQuery)3 SearchException (gate.creole.annic.SearchException)2 Document (gate.creole.annic.apache.lucene.document.Document)2 Hits (gate.creole.annic.apache.lucene.search.Hits)2 ArrayList (java.util.ArrayList)2 HashSet (java.util.HashSet)2 List (java.util.List)2 IndexException (gate.creole.annic.IndexException)1 Pattern (gate.creole.annic.Pattern)1 IndexReader (gate.creole.annic.apache.lucene.index.IndexReader)1 TermEnum (gate.creole.annic.apache.lucene.index.TermEnum)1 BooleanQuery (gate.creole.annic.apache.lucene.search.BooleanQuery)1