Search in sources :

Example 1 with TermEnum

use of gate.creole.annic.apache.lucene.index.TermEnum in project gate-core by GateNLP.

the class FieldCacheImpl method getStringIndex.

// inherit javadocs
// Implementation notes:
// - Returns the cached StringIndex for (reader, field) when lookup() finds one;
//   otherwise walks the terms of the field, builds the index, stores it and
//   returns it.
// - retArray[doc] holds the number of the term found for that document and
//   mterms[number] holds the term text. Slot 0 of mterms is always null and is
//   the value seen by documents that have no term in the field.
// - Throws RuntimeException when the enumeration yields no term at all, or when
//   the field has more distinct terms than the reader has documents.
@Override
public StringIndex getStringIndex(IndexReader reader, String field) throws IOException {
    field = field.intern();
    Object ret = lookup(reader, field, STRING_INDEX);
    if (ret == null) {
        final int[] retArray = new int[reader.maxDoc()];
        String[] mterms = new String[reader.maxDoc() + 1];
        if (retArray.length > 0) {
            // current term number
            int t = 0;
            // an entry for documents that have no terms in this field
            // should a document with no terms be at top or bottom?
            // this puts them at the top - if it is changed, FieldDocSortedHitQueue
            // needs to change as well.
            mterms[t++] = null;
            // Each resource gets its own try/finally so that termDocs is released
            // even when reader.terms() fails, and termEnum is released even when
            // closing termDocs fails.
            TermDocs termDocs = reader.termDocs();
            try {
                TermEnum termEnum = reader.terms(new Term(field, ""));
                try {
                    if (termEnum.term() == null) {
                        throw new RuntimeException("no terms in field " + field);
                    }
                    do {
                        Term term = termEnum.term();
                        // Identity comparison: 'field' was interned above; this presumably
                        // relies on Term interning its field name as well - TODO confirm.
                        if (term.field() != field) {
                            // the enumeration has moved on to another field
                            break;
                        }
                        // we expect that there is at most one term per document
                        if (t >= mterms.length) {
                            throw new RuntimeException("there are more terms than documents in field \"" + field + "\"");
                        }
                        mterms[t] = term.text();
                        termDocs.seek(termEnum);
                        while (termDocs.next()) {
                            retArray[termDocs.doc()] = t;
                        }
                        t++;
                    } while (termEnum.next());
                } finally {
                    termEnum.close();
                }
            } finally {
                termDocs.close();
            }
            // t is at least 1 here (slot 0 was consumed above), so only the
            // "fewer terms than documents" case needs handling.
            if (t < mterms.length) {
                // trim off the dead array space
                String[] terms = new String[t];
                System.arraycopy(mterms, 0, terms, 0, t);
                mterms = terms;
            }
        }
        StringIndex value = new StringIndex(retArray, mterms);
        store(reader, field, STRING_INDEX, value);
        return value;
    }
    return (StringIndex) ret;
}
Also used : TermDocs(gate.creole.annic.apache.lucene.index.TermDocs) Term(gate.creole.annic.apache.lucene.index.Term) TermEnum(gate.creole.annic.apache.lucene.index.TermEnum)

Example 2 with TermEnum

use of gate.creole.annic.apache.lucene.index.TermEnum in project gate-core by GateNLP.

the class FieldCacheImpl method getInts.

// inherit javadocs
// Implementation notes:
// - Returns the cached int[] for (reader, field) when lookup() finds one;
//   otherwise parses every term of the field with Integer.parseInt, writes the
//   value into the slot of each document containing that term, stores the
//   array and returns it.
// - Documents that contain no term of the field keep the array default of 0.
// - Throws RuntimeException when the enumeration yields no term at all;
//   Integer.parseInt throws NumberFormatException on a non-integer term.
@Override
public int[] getInts(IndexReader reader, String field) throws IOException {
    field = field.intern();
    Object ret = lookup(reader, field, SortField.INT);
    if (ret == null) {
        final int[] retArray = new int[reader.maxDoc()];
        if (retArray.length > 0) {
            // Each resource gets its own try/finally so that termDocs is released
            // even when reader.terms() fails, and termEnum is released even when
            // closing termDocs fails.
            TermDocs termDocs = reader.termDocs();
            try {
                TermEnum termEnum = reader.terms(new Term(field, ""));
                try {
                    if (termEnum.term() == null) {
                        throw new RuntimeException("no terms in field " + field);
                    }
                    do {
                        Term term = termEnum.term();
                        // Identity comparison: 'field' was interned above; this presumably
                        // relies on Term interning its field name as well - TODO confirm.
                        if (term.field() != field) {
                            // the enumeration has moved on to another field
                            break;
                        }
                        int termval = Integer.parseInt(term.text());
                        termDocs.seek(termEnum);
                        while (termDocs.next()) {
                            retArray[termDocs.doc()] = termval;
                        }
                    } while (termEnum.next());
                } finally {
                    termEnum.close();
                }
            } finally {
                termDocs.close();
            }
        }
        store(reader, field, SortField.INT, retArray);
        return retArray;
    }
    return (int[]) ret;
}
Also used : TermDocs(gate.creole.annic.apache.lucene.index.TermDocs) Term(gate.creole.annic.apache.lucene.index.Term) TermEnum(gate.creole.annic.apache.lucene.index.TermEnum)

Example 3 with TermEnum

use of gate.creole.annic.apache.lucene.index.TermEnum in project gate-core by GateNLP.

the class FieldCacheImpl method getAuto.

/**
 * The pattern used to detect integer values in a field
 */
/**
 * removed for java 1.3 compatibility
 *   protected static final Pattern pIntegers = Pattern.compile ("[0-9\\-]+");
 */
/**
 * The pattern used to detect float values in a field
 */
/**
 * removed for java 1.3 compatibility
 * protected static final Object pFloats = Pattern.compile ("[0-9+\\-\\.eEfFdD]+");
 */
// inherit javadocs
// Implementation notes:
// - A cached value for (reader, field, SortField.AUTO) is returned as-is.
// - Otherwise the trimmed text of the first term of the field is probed:
//   if it parses as an int the field is loaded through getInts, else if it
//   parses as a float through getFloats, else through getStringIndex.
// - A NumberFormatException raised while loading ints or floats is handled
//   exactly like a failed probe, i.e. the next representation is tried.
@Override
public Object getAuto(IndexReader reader, String field) throws IOException {
    field = field.intern();
    final Object cached = lookup(reader, field, SortField.AUTO);
    if (cached != null) {
        return cached;
    }

    Object detected = null;
    final TermEnum enumerator = reader.terms(new Term(field, ""));
    try {
        final Term first = enumerator.term();
        if (first == null) {
            throw new RuntimeException("no terms in field " + field + " - cannot determine sort type");
        }
        // identity comparison against the interned field name
        if (first.field() != field) {
            throw new RuntimeException("field \"" + field + "\" does not appear to be indexed");
        }

        final String sample = first.text().trim();
        // Java 1.3 level code: probe by parsing instead of using regex patterns
        try {
            Integer.parseInt(sample);
            detected = getInts(reader, field);
        } catch (NumberFormatException notAnInt) {
            try {
                Float.parseFloat(sample);
                detected = getFloats(reader, field);
            } catch (NumberFormatException notAFloat) {
                detected = getStringIndex(reader, field);
            }
        }

        if (detected != null) {
            store(reader, field, SortField.AUTO, detected);
        }
    } finally {
        enumerator.close();
    }
    return detected;
}
Also used : Term(gate.creole.annic.apache.lucene.index.Term) TermEnum(gate.creole.annic.apache.lucene.index.TermEnum)

Example 4 with TermEnum

use of gate.creole.annic.apache.lucene.index.TermEnum in project gate-core by GateNLP.

the class FieldCacheImpl method getCustom.

// inherit javadocs
// Implementation notes:
// - Returns the cached Comparable[] when lookup() finds one; otherwise converts
//   every term of the field with comparator.getComparable(), writes the result
//   into the slot of each document containing that term, stores the array and
//   returns it.
// - Documents that contain no term of the field keep a null slot.
// - Throws RuntimeException when the enumeration yields no term at all.
@Override
public Comparable[] getCustom(IndexReader reader, String field, SortComparator comparator) throws IOException {
    field = field.intern();
    Object ret = lookup(reader, field, comparator);
    if (ret == null) {
        final Comparable[] retArray = new Comparable[reader.maxDoc()];
        if (retArray.length > 0) {
            // Each resource gets its own try/finally so that termDocs is released
            // even when reader.terms() fails, and termEnum is released even when
            // closing termDocs fails.
            TermDocs termDocs = reader.termDocs();
            try {
                TermEnum termEnum = reader.terms(new Term(field, ""));
                try {
                    if (termEnum.term() == null) {
                        throw new RuntimeException("no terms in field " + field);
                    }
                    do {
                        Term term = termEnum.term();
                        // Identity comparison: 'field' was interned above; this presumably
                        // relies on Term interning its field name as well - TODO confirm.
                        if (term.field() != field) {
                            // the enumeration has moved on to another field
                            break;
                        }
                        Comparable termval = comparator.getComparable(term.text());
                        termDocs.seek(termEnum);
                        while (termDocs.next()) {
                            retArray[termDocs.doc()] = termval;
                        }
                    } while (termEnum.next());
                } finally {
                    termEnum.close();
                }
            } finally {
                termDocs.close();
            }
        }
        // NOTE(review): the value is stored under SortField.CUSTOM while lookup()
        // above is keyed by the comparator instance; if those keys never match,
        // the array is rebuilt on every call - confirm against lookup()/store().
        store(reader, field, SortField.CUSTOM, retArray);
        return retArray;
    }
    // The cached value is the Comparable[] built above; casting it to String[]
    // would throw ClassCastException on a cache hit.
    return (Comparable[]) ret;
}
Also used : TermDocs(gate.creole.annic.apache.lucene.index.TermDocs) Term(gate.creole.annic.apache.lucene.index.Term) TermEnum(gate.creole.annic.apache.lucene.index.TermEnum)

Example 5 with TermEnum

use of gate.creole.annic.apache.lucene.index.TermEnum in project gate-core by GateNLP.

the class LuceneSearcher method getIndexedAnnotationSetNames.

/**
 * This method returns a set of annotation set names that are indexed. Each
 * entry has the following format:
 * <p>
 * corpusName;annotationSetName
 * </p>
 * where, the corpusName is the name of the corpus the annotationSetName
 * belongs to. Documents without a corpus id contribute an entry with an
 * empty corpus name.
 * <p>
 * As a side effect the method rebuilds {@code annotationTypesMap}: for every
 * key {@code corpusName;annotationSetName;annotationType} it records the
 * distinct feature names listed in the documents' indexed-features field.
 * </p>
 *
 * @return the distinct {@code corpusName;annotationSetName} entries, or an
 *         empty array when the index reader returns no term enumeration
 * @throws SearchException if reading the index fails with an
 *         {@link IOException}
 */
@Override
public String[] getIndexedAnnotationSetNames() throws SearchException {
    URL indexUrl = (URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL);
    String indexLocation;
    try {
        indexLocation = new File(indexUrl.toURI()).getAbsolutePath();
    } catch (URISyntaxException use) {
        // the URL is not a well-formed URI: fall back to its raw file part
        indexLocation = new File(indexUrl.getFile()).getAbsolutePath();
    }
    annotationTypesMap = new HashMap<String, List<String>>();
    Set<String> toReturn = new HashSet<String>();
    try {
        IndexReader reader = IndexReader.open(indexLocation);
        try {
            // lets first obtain stored corpora
            TermEnum terms = reader.terms(new Term(Constants.ANNOTATION_SET_ID, ""));
            if (terms == null) {
                return new String[0];
            }
            // iterating over terms and finding out names of annotation sets indexed
            Set<String> annotSets = new HashSet<String>();
            try {
                boolean foundAnnotSet = false;
                do {
                    Term t = terms.term();
                    if (t == null) {
                        continue;
                    }
                    if (t.field().equals(Constants.ANNOTATION_SET_ID)) {
                        annotSets.add(t.text());
                        foundAnnotSet = true;
                    } else if (foundAnnotSet) {
                        // we have walked past the last annotation-set term
                        break;
                    }
                } while (terms.next());
            } finally {
                // the enumeration is no longer needed once the names are collected
                terms.close();
            }
            // but not all documents belong to corpora
            for (String annotSet : annotSets) {
                Term term = new Term(Constants.ANNOTATION_SET_ID, annotSet);
                TermQuery tq = new TermQuery(term);
                gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(indexLocation);
                try {
                    Hits annotSetHits = searcher.search(tq);
                    for (int i = 0; i < annotSetHits.length(); i++) {
                        Document luceneDoc = annotSetHits.doc(i);
                        String corpusID = luceneDoc.get(Constants.CORPUS_ID);
                        if (corpusID == null) {
                            corpusID = "";
                        }
                        toReturn.add(corpusID + ";" + annotSet);
                        // lets create a boolean query
                        // NOTE(review): both clauses carry the same annotation-set term;
                        // presumably one was meant to restrict by corpus - confirm.
                        Term annotSetTerm = new Term(Constants.ANNOTATION_SET_ID, annotSet);
                        TermQuery atq = new TermQuery(annotSetTerm);
                        BooleanQuery bq = new BooleanQuery();
                        bq.add(tq, true, false);
                        bq.add(atq, true, false);
                        // The query runs on the searcher opened for this annotation set;
                        // no additional searcher is opened per hit.
                        Hits indexFeaturesHits = searcher.search(bq);
                        for (int j = 0; j < indexFeaturesHits.length(); j++) {
                            Document aDoc = indexFeaturesHits.doc(j);
                            String indexedFeatures = aDoc.get(Constants.INDEXED_FEATURES);
                            if (indexedFeatures != null) {
                                String[] features = indexedFeatures.split(";");
                                for (String aFeature : features) {
                                    // AnnotationType.FeatureName
                                    int index = aFeature.indexOf(".");
                                    if (index == -1) {
                                        continue;
                                    }
                                    String type = aFeature.substring(0, index);
                                    String featureName = aFeature.substring(index + 1);
                                    String key = corpusID + ";" + annotSet + ";" + type;
                                    List<String> listOfFeatures = annotationTypesMap.get(key);
                                    if (listOfFeatures == null) {
                                        listOfFeatures = new ArrayList<String>();
                                        annotationTypesMap.put(key, listOfFeatures);
                                    }
                                    if (!listOfFeatures.contains(featureName)) {
                                        listOfFeatures.add(featureName);
                                    }
                                }
                            }
                        }
                    }
                } finally {
                    searcher.close();
                }
            }
        } finally {
            reader.close();
        }
    } catch (IOException ioe) {
        // the cause is preserved in the wrapper, so nothing is printed here
        throw new SearchException(ioe);
    }
    return toReturn.toArray(new String[0]);
}
Also used : IndexSearcher(gate.creole.annic.apache.lucene.search.IndexSearcher) BooleanQuery(gate.creole.annic.apache.lucene.search.BooleanQuery) Hits(gate.creole.annic.apache.lucene.search.Hits) SearchException(gate.creole.annic.SearchException) URISyntaxException(java.net.URISyntaxException) TermEnum(gate.creole.annic.apache.lucene.index.TermEnum) Document(gate.creole.annic.apache.lucene.document.Document) URL(java.net.URL) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException) IndexReader(gate.creole.annic.apache.lucene.index.IndexReader) File(java.io.File)

Aggregations

Term (gate.creole.annic.apache.lucene.index.Term)7 TermEnum (gate.creole.annic.apache.lucene.index.TermEnum)7 TermDocs (gate.creole.annic.apache.lucene.index.TermDocs)5 SearchException (gate.creole.annic.SearchException)1 Document (gate.creole.annic.apache.lucene.document.Document)1 IndexReader (gate.creole.annic.apache.lucene.index.IndexReader)1 BooleanQuery (gate.creole.annic.apache.lucene.search.BooleanQuery)1 Hits (gate.creole.annic.apache.lucene.search.Hits)1 IndexSearcher (gate.creole.annic.apache.lucene.search.IndexSearcher)1 TermQuery (gate.creole.annic.apache.lucene.search.TermQuery)1 File (java.io.File)1 IOException (java.io.IOException)1 URISyntaxException (java.net.URISyntaxException)1 URL (java.net.URL)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 List (java.util.List)1