Search in sources :

Example 6 with Term

use of gate.creole.annic.apache.lucene.index.Term in project gate-core by GateNLP.

the class FieldCacheImpl method getStringIndex.

/**
 * {@inheritDoc}
 *
 * <p>Implementation notes: the result is looked up in, and stored into, the
 * cache under the {@code STRING_INDEX} key. Every distinct term of
 * {@code field} is numbered in enumeration order starting at 1; slot 0 of the
 * lookup table is a {@code null} entry used for documents that have no term
 * in the field.
 *
 * @throws RuntimeException if the enumerator is positioned on no term at all,
 *         or if the field holds more distinct terms than there are documents
 */
@Override
public StringIndex getStringIndex(IndexReader reader, String field) throws IOException {
    // The loop below compares field names by identity, so the name is interned
    // first (this presumes Term hands back interned field names as well).
    field = field.intern();
    Object ret = lookup(reader, field, STRING_INDEX);
    if (ret != null) {
        return (StringIndex) ret;
    }
    final int[] retArray = new int[reader.maxDoc()];
    String[] mterms = new String[reader.maxDoc() + 1];
    if (retArray.length > 0) {
        TermDocs termDocs = reader.termDocs();
        TermEnum termEnum = null;
        // current term number
        int t = 0;
        // an entry for documents that have no terms in this field
        // should a document with no terms be at top or bottom?
        // this puts them at the top - if it is changed, FieldDocSortedHitQueue
        // needs to change as well.
        mterms[t++] = null;
        try {
            // Opened inside the try so termDocs is still released if this call fails.
            termEnum = reader.terms(new Term(field, ""));
            if (termEnum.term() == null) {
                throw new RuntimeException("no terms in field " + field);
            }
            do {
                Term term = termEnum.term();
                if (term.field() != field) {
                    // the enumeration has moved on to another field
                    break;
                }
                // we expect that there is at most one term per document
                if (t >= mterms.length) {
                    throw new RuntimeException("there are more terms than documents in field \"" + field + "\"");
                }
                mterms[t] = term.text();
                termDocs.seek(termEnum);
                while (termDocs.next()) {
                    retArray[termDocs.doc()] = t;
                }
                t++;
            } while (termEnum.next());
        } finally {
            // Nested so that a failure while closing termDocs cannot leak termEnum.
            try {
                termDocs.close();
            } finally {
                if (termEnum != null) {
                    termEnum.close();
                }
            }
        }
        // t is at least 1 here because of the reserved null slot, so only the
        // "fewer terms than documents" case needs handling: trim the dead space.
        if (t < mterms.length) {
            String[] terms = new String[t];
            System.arraycopy(mterms, 0, terms, 0, t);
            mterms = terms;
        }
    }
    StringIndex value = new StringIndex(retArray, mterms);
    store(reader, field, STRING_INDEX, value);
    return value;
}
Also used : TermDocs(gate.creole.annic.apache.lucene.index.TermDocs) Term(gate.creole.annic.apache.lucene.index.Term) TermEnum(gate.creole.annic.apache.lucene.index.TermEnum)

Example 7 with Term

use of gate.creole.annic.apache.lucene.index.Term in project gate-core by GateNLP.

the class FieldCacheImpl method getInts.

/**
 * {@inheritDoc}
 *
 * <p>Implementation notes: the result is looked up in, and stored into, the
 * cache under the {@code SortField.INT} key. Each term text of {@code field}
 * is parsed with {@link Integer#parseInt(String)} and written into the slot
 * of every document that contains the term; documents without a term keep
 * the array default of 0.
 *
 * @throws RuntimeException if the enumerator is positioned on no term at all
 * @throws NumberFormatException if a term text of the field is not an integer
 */
@Override
public int[] getInts(IndexReader reader, String field) throws IOException {
    // The loop below compares field names by identity, so the name is interned
    // first (this presumes Term hands back interned field names as well).
    field = field.intern();
    Object ret = lookup(reader, field, SortField.INT);
    if (ret != null) {
        return (int[]) ret;
    }
    final int[] retArray = new int[reader.maxDoc()];
    if (retArray.length > 0) {
        TermDocs termDocs = reader.termDocs();
        TermEnum termEnum = null;
        try {
            // Opened inside the try so termDocs is still released if this call fails.
            termEnum = reader.terms(new Term(field, ""));
            if (termEnum.term() == null) {
                throw new RuntimeException("no terms in field " + field);
            }
            do {
                Term term = termEnum.term();
                if (term.field() != field) {
                    // the enumeration has moved on to another field
                    break;
                }
                int termval = Integer.parseInt(term.text());
                termDocs.seek(termEnum);
                while (termDocs.next()) {
                    retArray[termDocs.doc()] = termval;
                }
            } while (termEnum.next());
        } finally {
            // Nested so that a failure while closing termDocs cannot leak termEnum.
            try {
                termDocs.close();
            } finally {
                if (termEnum != null) {
                    termEnum.close();
                }
            }
        }
    }
    store(reader, field, SortField.INT, retArray);
    return retArray;
}
Also used : TermDocs(gate.creole.annic.apache.lucene.index.TermDocs) Term(gate.creole.annic.apache.lucene.index.Term) TermEnum(gate.creole.annic.apache.lucene.index.TermEnum)

Example 8 with Term

use of gate.creole.annic.apache.lucene.index.Term in project gate-core by GateNLP.

the class FieldCacheImpl method getAuto.

/**
 * The pattern used to detect integer values in a field
 */
/**
 * removed for java 1.3 compatibility
 *   protected static final Pattern pIntegers = Pattern.compile ("[0-9\\-]+");
 */
/**
 * The pattern used to detect float values in a field
 */
/**
 * removed for java 1.3 compatibility
 * protected static final Object pFloats = Pattern.compile ("[0-9+\\-\\.eEfFdD]+");
 */
/**
 * {@inheritDoc}
 *
 * <p>Implementation notes: the type is guessed from the first term of
 * {@code field}. If its trimmed text parses as an int the int cache is used,
 * otherwise if it parses as a float the float cache is used, otherwise the
 * string index is used. A {@link NumberFormatException} raised while building
 * the int or float cache also falls through to the next candidate type. The
 * chosen value is stored under the {@code SortField.AUTO} key.
 *
 * @throws RuntimeException if the enumerator is positioned on no term at all,
 *         or if its first term belongs to a different field
 */
@Override
public Object getAuto(IndexReader reader, String field) throws IOException {
    field = field.intern();
    Object cached = lookup(reader, field, SortField.AUTO);
    if (cached != null) {
        return cached;
    }
    Object detected = null;
    TermEnum firstTerms = reader.terms(new Term(field, ""));
    try {
        Term first = firstTerms.term();
        if (first == null) {
            throw new RuntimeException("no terms in field " + field + " - cannot determine sort type");
        }
        // identity comparison: both names are expected to be interned
        if (first.field() != field) {
            throw new RuntimeException("field \"" + field + "\" does not appear to be indexed");
        }
        String sample = first.text().trim();
        // Java 1.3 level code: probe by parsing instead of using regex patterns.
        try {
            Integer.parseInt(sample);
            detected = getInts(reader, field);
        } catch (NumberFormatException notAnInt) {
            try {
                Float.parseFloat(sample);
                detected = getFloats(reader, field);
            } catch (NumberFormatException notAFloat) {
                detected = getStringIndex(reader, field);
            }
        }
        if (detected != null) {
            store(reader, field, SortField.AUTO, detected);
        }
    } finally {
        firstTerms.close();
    }
    return detected;
}
Also used : Term(gate.creole.annic.apache.lucene.index.Term) TermEnum(gate.creole.annic.apache.lucene.index.TermEnum)

Example 9 with Term

use of gate.creole.annic.apache.lucene.index.Term in project gate-core by GateNLP.

the class FieldCacheImpl method getCustom.

/**
 * {@inheritDoc}
 *
 * <p>Implementation notes: the cache key is the {@code comparator} itself,
 * for both the lookup and the store. Each term text of {@code field} is
 * converted with {@code comparator.getComparable(String)} and the result is
 * written into the slot of every document that contains the term; documents
 * without a term keep a {@code null} entry.
 *
 * @throws RuntimeException if the enumerator is positioned on no term at all
 */
@Override
public Comparable[] getCustom(IndexReader reader, String field, SortComparator comparator) throws IOException {
    // The loop below compares field names by identity, so the name is interned
    // first (this presumes Term hands back interned field names as well).
    field = field.intern();
    Object ret = lookup(reader, field, comparator);
    if (ret != null) {
        // The cached value is the Comparable[] stored below, not a String[].
        return (Comparable[]) ret;
    }
    final Comparable[] retArray = new Comparable[reader.maxDoc()];
    if (retArray.length > 0) {
        TermDocs termDocs = reader.termDocs();
        TermEnum termEnum = null;
        try {
            // Opened inside the try so termDocs is still released if this call fails.
            termEnum = reader.terms(new Term(field, ""));
            if (termEnum.term() == null) {
                throw new RuntimeException("no terms in field " + field);
            }
            do {
                Term term = termEnum.term();
                if (term.field() != field) {
                    // the enumeration has moved on to another field
                    break;
                }
                Comparable termval = comparator.getComparable(term.text());
                termDocs.seek(termEnum);
                while (termDocs.next()) {
                    retArray[termDocs.doc()] = termval;
                }
            } while (termEnum.next());
        } finally {
            // Nested so that a failure while closing termDocs cannot leak termEnum.
            try {
                termDocs.close();
            } finally {
                if (termEnum != null) {
                    termEnum.close();
                }
            }
        }
    }
    // Store under the same key the lookup above uses, so later calls with this
    // comparator can find the cached array.
    store(reader, field, comparator, retArray);
    return retArray;
}
Also used : TermDocs(gate.creole.annic.apache.lucene.index.TermDocs) Term(gate.creole.annic.apache.lucene.index.Term) TermEnum(gate.creole.annic.apache.lucene.index.TermEnum)

Example 10 with Term

use of gate.creole.annic.apache.lucene.index.Term in project gate-core by GateNLP.

the class QueryParser method createTerms.

/**
 * Turns one query element into the terms to search for.
 *
 * <p>Two shapes of element are handled:
 * <ul>
 * <li>an element wrapped in braces, holding one or more comma-separated
 * sub-elements of the form {@code AnnotationType},
 * {@code AnnotationType==String} / {@code AnnotationType=="String"} or
 * {@code AnnotationType.feature==String} /
 * {@code AnnotationType.feature=="String"}; all sub-elements share the
 * current position, which is then advanced by one;</li>
 * <li>any other element, treated as plain text: unescaped double quotes are
 * dropped, the text is split on spaces, and each non-empty token becomes a
 * {@code baseTokenAnnotationType + ".string"} term at its own position.</li>
 * </ul>
 *
 * <p>Side effects: resets and updates the {@code areAllTermsTokens} field and
 * advances the {@code position} field.
 *
 * @param elem the query element; must not be empty after trimming
 * @return an array of three parallel lists: the {@code Term}s, their
 *         {@code Integer} positions, and {@code Boolean} flags that are true
 *         for the first sub-element of a braced element and for every
 *         plain-text token
 * @throws gate.creole.ir.SearchException if a sub-element names a feature
 *         ({@code Type.feature}) without an {@code ==} operator
 */
public List<?>[] createTerms(String elem) throws gate.creole.ir.SearchException {
    areAllTermsTokens = true;
    List<Term> terms = new ArrayList<Term>();
    List<Integer> pos = new ArrayList<Integer>();
    List<Boolean> consider = new ArrayList<Boolean>();
    elem = elem.trim();
    if (elem.charAt(0) == '{' && elem.charAt(elem.length() - 1) == '}') {
        // possible
        elem = elem.substring(1, elem.length() - 1);
        int index = elem.indexOf("==");
        int index1 = findIndexOf(elem, '.');
        if (index == -1 && index1 == -1) {
            // 3. {AnnotationType}
            // this can be {AnnotationType, AnnotationType...}
            ArrayList<String> fields = splitString(elem, ',', true);
            for (int p = 0; p < fields.size(); p++) {
                if (areAllTermsTokens && !fields.get(p).equals(baseTokenAnnotationType)) {
                    areAllTermsTokens = false;
                }
                terms.add(new Term(field, norm(fields.get(p)), "*"));
                pos.add(position);
                consider.add(p == 0);
            }
            position++;
        } else if (index != -1 && index1 == -1) {
            // 4. {AnnotationType==String}
            // 5. {AnnotationType=="String"}
            ArrayList<String> fields = splitString(elem, ',', false);
            for (int p = 0; p < fields.size(); p++) {
                index = fields.get(p).indexOf("==");
                // {AnnotationType, AnnotationType=="String"}
                if (index != -1) {
                    String annotType = norm(fields.get(p).substring(0, index).trim());
                    String annotText = norm(fields.get(p).substring(index + 2, fields.get(p).length()).trim());
                    if (annotText.length() > 2 && annotText.charAt(0) == '\"' && annotText.charAt(annotText.length() - 1) == '\"') {
                        annotText = annotText.substring(1, annotText.length() - 1);
                    }
                    if (!annotType.trim().equals(baseTokenAnnotationType)) {
                        areAllTermsTokens = false;
                    }
                    terms.add(new Term(field, annotText, annotType + ".string"));
                    pos.add(position);
                    consider.add(p == 0);
                } else {
                    if (!(norm(fields.get(p))).equals(baseTokenAnnotationType)) {
                        areAllTermsTokens = false;
                    }
                    terms.add(new Term(field, norm(fields.get(p)), "*"));
                    pos.add(position);
                    consider.add(p == 0);
                }
            }
            position++;
        } else if (index == -1 && index1 != -1) {
            throw new SearchException("missing operator", "an equal operator (==) is missing", elem, (elem.indexOf("=", index1) != -1) ? elem.indexOf("=", index1) : elem.length());
        } else if (index != -1 && index1 != -1) {
            // it can be {AT, AT.f==S, AT=="S"}
            int index2 = findIndexOf(elem, ',');
            String[] subElems = null;
            if (index2 == -1) {
                subElems = new String[] { elem };
            } else {
                ArrayList<String> list = splitString(elem, ',', false);
                subElems = new String[list.size()];
                for (int k = 0; k < list.size(); k++) {
                    subElems[k] = list.get(k);
                }
            }
            // offset into elem of the sub-element being processed; used only to
            // locate the error position reported by the SearchException below
            int lengthTravelledSoFar = 0;
            for (int j = 0; j < subElems.length; j++) {
                // 7. {AnnotationType.feature==string}
                // 8. {AnnotationType.feature=="string"}
                index = subElems[j].indexOf("==");
                index1 = findIndexOf(subElems[j], '.');
                if (index == -1 && index1 == -1) {
                    // this is {AT}
                    if (!norm(subElems[j].trim()).equals(baseTokenAnnotationType)) {
                        areAllTermsTokens = false;
                    }
                    terms.add(new Term(field, norm(subElems[j].trim()), "*"));
                    pos.add(position);
                    consider.add(j == 0);
                } else if (index != -1 && index1 == -1) {
                    // this is {AT=="String"}
                    String annotType = norm(subElems[j].substring(0, index).trim());
                    String annotText = norm(subElems[j].substring(index + 2, subElems[j].length()).trim());
                    // The length guard keeps an empty value or a lone quote character
                    // from raising StringIndexOutOfBoundsException; every value that
                    // was handled before is handled the same way.
                    if (annotText.length() >= 2 && annotText.charAt(0) == '\"' && annotText.charAt(annotText.length() - 1) == '\"') {
                        annotText = annotText.substring(1, annotText.length() - 1);
                    }
                    if (!annotType.trim().equals(baseTokenAnnotationType)) {
                        areAllTermsTokens = false;
                    }
                    terms.add(new Term(field, annotText, annotType + ".string"));
                    pos.add(position);
                    consider.add(j == 0);
                } else if (index == -1 && index1 != -1) {
                    throw new SearchException("missing operator", "an equal operator (==) is missing", elem, (elem.indexOf("=", lengthTravelledSoFar) != -1) ? elem.indexOf("=", lengthTravelledSoFar) : elem.length());
                } else {
                    // this is {AT.f == "s"}
                    String annotType = norm(subElems[j].substring(0, index1).trim());
                    String featureType = norm(subElems[j].substring(index1 + 1, index).trim());
                    String featureText = norm(subElems[j].substring(index + 2, subElems[j].length()).trim());
                    if (featureText.length() > 2 && featureText.charAt(0) == '\"' && featureText.charAt(featureText.length() - 1) == '\"') {
                        featureText = featureText.substring(1, featureText.length() - 1);
                    }
                    if (!annotType.trim().equals(baseTokenAnnotationType)) {
                        areAllTermsTokens = false;
                    }
                    terms.add(new Term(field, featureText, annotType + "." + featureType));
                    pos.add(position);
                    consider.add(j == 0);
                }
                lengthTravelledSoFar += subElems[j].length() + 1;
            }
            position++;
        }
    } else {
        // possible
        // remove all the inverted commas, except those preceded by a backslash
        StringBuilder newString = new StringBuilder();
        char prev = ' ', ch = ' ';
        for (int i = 0; i < elem.length(); i++) {
            prev = ch;
            ch = elem.charAt(i);
            if (ch == '\"' && prev != '\\') {
                continue;
            } else {
                newString.append(ch);
            }
        }
        // there can be many tokens
        String[] subTokens = norm(newString.toString()).split("( )+");
        for (int k = 0; k < subTokens.length; k++) {
            if (subTokens[k].trim().length() > 0) {
                terms.add(new Term(field, norm(subTokens[k]), baseTokenAnnotationType + ".string"));
                pos.add(position);
                consider.add(Boolean.TRUE);
                // unlike the braced form, every plain token gets its own position
                position++;
            }
        }
    }
    return new List<?>[] { terms, pos, consider };
}
Also used : ArrayList(java.util.ArrayList) SearchException(gate.creole.ir.SearchException) Term(gate.creole.annic.apache.lucene.index.Term) List(java.util.List) ArrayList(java.util.ArrayList)

Aggregations

Term (gate.creole.annic.apache.lucene.index.Term): 16, TermEnum (gate.creole.annic.apache.lucene.index.TermEnum): 7, TermQuery (gate.creole.annic.apache.lucene.search.TermQuery): 6, TermDocs (gate.creole.annic.apache.lucene.index.TermDocs): 5, IOException (java.io.IOException): 5, ArrayList (java.util.ArrayList): 5, List (java.util.List): 5, File (java.io.File): 4, URISyntaxException (java.net.URISyntaxException): 4, URL (java.net.URL): 4, SearchException (gate.creole.annic.SearchException): 3, BooleanQuery (gate.creole.annic.apache.lucene.search.BooleanQuery): 3, Hits (gate.creole.annic.apache.lucene.search.Hits): 3, IndexSearcher (gate.creole.annic.apache.lucene.search.IndexSearcher): 3, PhraseQuery (gate.creole.annic.apache.lucene.search.PhraseQuery): 3, IndexException (gate.creole.annic.IndexException): 2, Document (gate.creole.annic.apache.lucene.document.Document): 2, IndexReader (gate.creole.annic.apache.lucene.index.IndexReader): 2, HashSet (java.util.HashSet): 2, Pattern (gate.creole.annic.Pattern): 1