use of gate.creole.annic.apache.lucene.index.TermEnum in project gate-core by GateNLP.
the class FieldCacheImpl method getStringIndex.
// inherit javadocs
//
// Implementation notes: for the given field this builds (a) an array that maps
// each document number to the ordinal of the term found for it and (b) the
// table of term texts in enumeration order. The result is handed to store()
// under STRING_INDEX and is returned directly when lookup() already has it.
@Override
public StringIndex getStringIndex(IndexReader reader, String field) throws IOException {
  // The loop below compares field names by identity, so work on the interned
  // instance (assumes Term.field() also returns interned strings - confirm).
  field = field.intern();
  Object ret = lookup(reader, field, STRING_INDEX);
  if (ret != null) {
    return (StringIndex) ret;
  }

  final int[] retArray = new int[reader.maxDoc()];
  String[] mterms = new String[reader.maxDoc() + 1];
  if (retArray.length > 0) {
    // current term number
    int t = 0;
    // an entry for documents that have no terms in this field
    // should a document with no terms be at top or bottom?
    // this puts them at the top - if it is changed, FieldDocSortedHitQueue
    // needs to change as well.
    mterms[t++] = null;

    TermDocs termDocs = reader.termDocs();
    TermEnum termEnum = null;
    try {
      // Opened inside the try so termDocs is still released if this call throws.
      termEnum = reader.terms(new Term(field, ""));
      if (termEnum.term() == null) {
        throw new RuntimeException("no terms in field " + field);
      }
      do {
        Term term = termEnum.term();
        if (term.field() != field) {
          // the enumeration has moved on to another field
          break;
        }
        // we expect that there is at most one term per document
        if (t >= mterms.length) {
          throw new RuntimeException("there are more terms than documents in field \"" + field + "\"");
        }
        mterms[t] = term.text();
        termDocs.seek(termEnum);
        while (termDocs.next()) {
          retArray[termDocs.doc()] = t;
        }
        t++;
      } while (termEnum.next());
    } finally {
      // Close both resources even if the first close() fails.
      try {
        termDocs.close();
      } finally {
        if (termEnum != null) {
          termEnum.close();
        }
      }
    }

    // t is at least 1 here because slot 0 is always reserved above, so only
    // the "fewer terms than documents" case needs handling:
    // trim off the dead array space.
    if (t < mterms.length) {
      String[] terms = new String[t];
      System.arraycopy(mterms, 0, terms, 0, t);
      mterms = terms;
    }
  }

  StringIndex value = new StringIndex(retArray, mterms);
  store(reader, field, STRING_INDEX, value);
  return value;
}
use of gate.creole.annic.apache.lucene.index.TermEnum in project gate-core by GateNLP.
the class FieldCacheImpl method getInts.
// inherit javadocs
//
// Implementation notes: fills an int array, indexed by document number, with
// the Integer.parseInt() value of the term text found for each document in
// the given field. Documents that are never visited keep the default 0.
// The array is handed to store() under SortField.INT and is returned directly
// when lookup() already has it.
@Override
public int[] getInts(IndexReader reader, String field) throws IOException {
  // The loop below compares field names by identity, so work on the interned
  // instance (assumes Term.field() also returns interned strings - confirm).
  field = field.intern();
  Object ret = lookup(reader, field, SortField.INT);
  if (ret != null) {
    return (int[]) ret;
  }

  final int[] retArray = new int[reader.maxDoc()];
  if (retArray.length > 0) {
    TermDocs termDocs = reader.termDocs();
    TermEnum termEnum = null;
    try {
      // Opened inside the try so termDocs is still released if this call throws.
      termEnum = reader.terms(new Term(field, ""));
      if (termEnum.term() == null) {
        throw new RuntimeException("no terms in field " + field);
      }
      do {
        Term term = termEnum.term();
        if (term.field() != field) {
          // the enumeration has moved on to another field
          break;
        }
        // A non-numeric term text surfaces as NumberFormatException to the caller.
        int termval = Integer.parseInt(term.text());
        termDocs.seek(termEnum);
        while (termDocs.next()) {
          retArray[termDocs.doc()] = termval;
        }
      } while (termEnum.next());
    } finally {
      // Close both resources even if the first close() fails.
      try {
        termDocs.close();
      } finally {
        if (termEnum != null) {
          termEnum.close();
        }
      }
    }
  }

  store(reader, field, SortField.INT, retArray);
  return retArray;
}
use of gate.creole.annic.apache.lucene.index.TermEnum in project gate-core by GateNLP.
the class FieldCacheImpl method getAuto.
/**
* The pattern used to detect integer values in a field
*/
/**
* removed for java 1.3 compatibility
* protected static final Pattern pIntegers = Pattern.compile ("[0-9\\-]+");
*/
/**
* The pattern used to detect float values in a field
*/
/**
* removed for java 1.3 compatibility
* protected static final Object pFloats = Pattern.compile ("[0-9+\\-\\.eEfFdD]+");
*/
// inherit javadocs
//
// Implementation notes: the representation is chosen by probing the first
// term enumerated for the field: ints if its trimmed text parses as an int,
// otherwise floats if it parses as a float, otherwise a string index. The
// chosen value is handed to store() under SortField.AUTO.
@Override
public Object getAuto(IndexReader reader, String field) throws IOException {
  field = field.intern();
  final Object cached = lookup(reader, field, SortField.AUTO);
  if (cached != null) {
    return cached;
  }

  final TermEnum probe = reader.terms(new Term(field, ""));
  try {
    final Term first = probe.term();
    if (first == null) {
      throw new RuntimeException("no terms in field " + field + " - cannot determine sort type");
    }
    // identity comparison: field was interned above
    if (first.field() != field) {
      throw new RuntimeException("field \"" + field + "\" does not appear to be indexed");
    }

    final String sample = first.text().trim();
    Object detected;
    // Java 1.3 level code: detection is done by attempting to parse.
    // The loader calls stay inside the try blocks on purpose, so that a
    // NumberFormatException raised while loading falls through to the next
    // representation as well.
    try {
      Integer.parseInt(sample);
      detected = getInts(reader, field);
    } catch (NumberFormatException notAnInt) {
      try {
        Float.parseFloat(sample);
        detected = getFloats(reader, field);
      } catch (NumberFormatException notAFloat) {
        detected = getStringIndex(reader, field);
      }
    }

    if (detected != null) {
      store(reader, field, SortField.AUTO, detected);
    }
    return detected;
  } finally {
    probe.close();
  }
}
use of gate.creole.annic.apache.lucene.index.TermEnum in project gate-core by GateNLP.
the class FieldCacheImpl method getCustom.
// inherit javadocs
//
// Implementation notes: fills a Comparable array, indexed by document number,
// with comparator.getComparable(termText) for the term found for each
// document in the given field. Documents that are never visited keep null.
@Override
public Comparable[] getCustom(IndexReader reader, String field, SortComparator comparator) throws IOException {
  // The loop below compares field names by identity, so work on the interned
  // instance (assumes Term.field() also returns interned strings - confirm).
  field = field.intern();
  Object ret = lookup(reader, field, comparator);
  if (ret != null) {
    // Cast to the declared return type. The array built below is a
    // Comparable[], which cannot be cast to String[] at runtime.
    return (Comparable[]) ret;
  }

  final Comparable[] retArray = new Comparable[reader.maxDoc()];
  if (retArray.length > 0) {
    TermDocs termDocs = reader.termDocs();
    TermEnum termEnum = null;
    try {
      // Opened inside the try so termDocs is still released if this call throws.
      termEnum = reader.terms(new Term(field, ""));
      if (termEnum.term() == null) {
        throw new RuntimeException("no terms in field " + field);
      }
      do {
        Term term = termEnum.term();
        if (term.field() != field) {
          // the enumeration has moved on to another field
          break;
        }
        Comparable termval = comparator.getComparable(term.text());
        termDocs.seek(termEnum);
        while (termDocs.next()) {
          retArray[termDocs.doc()] = termval;
        }
      } while (termEnum.next());
    } finally {
      // Close both resources even if the first close() fails.
      try {
        termDocs.close();
      } finally {
        if (termEnum != null) {
          termEnum.close();
        }
      }
    }
  }

  // NOTE(review): the lookup above is keyed by the comparator, but the result
  // is stored under SortField.CUSTOM. If those address different cache
  // entries, the lookup never hits and the array is rebuilt on every call -
  // confirm against the lookup()/store() overloads before changing the key.
  store(reader, field, SortField.CUSTOM, retArray);
  return retArray;
}
use of gate.creole.annic.apache.lucene.index.TermEnum in project gate-core by GateNLP.
the class LuceneSearcher method getIndexedAnnotationSetNames.
/**
 * This method returns a set of annotation set names that are indexed. Each
 * entry has the following format:
 * <p>
 * corpusName;annotationSetName
 * </p>
 * where, the corpusName is the name of the corpus the annotationSetName
 * belongs to. Documents without a stored corpus id contribute an entry with
 * an empty corpusName.
 * <p>
 * As a side effect, annotationTypesMap is replaced with a fresh map from
 * "corpusName;annotationSetName;annotationType" to the feature names listed
 * in the INDEXED_FEATURES value of the matching index documents.
 * </p>
 *
 * @return the distinct "corpusName;annotationSetName" entries, in no
 *         particular order; an empty array if no term enumeration is
 *         available
 * @throws SearchException if reading the index fails with an IOException
 */
@Override
public String[] getIndexedAnnotationSetNames() throws SearchException {
  URL indexUrl = (URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL);
  String indexLocation;
  try {
    indexLocation = new File(indexUrl.toURI()).getAbsolutePath();
  } catch (URISyntaxException use) {
    // The URL is not a well-formed URI; fall back to its raw file component.
    indexLocation = new File(indexUrl.getFile()).getAbsolutePath();
  }

  annotationTypesMap = new HashMap<String, List<String>>();
  Set<String> toReturn = new HashSet<String>();
  try {
    IndexReader reader = IndexReader.open(indexLocation);
    try {
      // lets first obtain the annotation set ids present in the index
      TermEnum terms = reader.terms(new Term(Constants.ANNOTATION_SET_ID, ""));
      if (terms == null) {
        return new String[0];
      }

      // iterating over terms and finding out names of annotation sets indexed
      Set<String> annotSets = new HashSet<String>();
      try {
        boolean foundAnnotSet = false;
        do {
          Term t = terms.term();
          if (t == null) {
            continue;
          }
          if (t.field().equals(Constants.ANNOTATION_SET_ID)) {
            annotSets.add(t.text());
            foundAnnotSet = true;
          } else if (foundAnnotSet) {
            // we have walked past the last term of the annotation set id field
            break;
          }
        } while (terms.next());
      } finally {
        // the enumeration is no longer needed once the ids are collected
        terms.close();
      }

      // but not all documents belong to corpora
      for (String annotSet : annotSets) {
        TermQuery tq = new TermQuery(new Term(Constants.ANNOTATION_SET_ID, annotSet));
        gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(indexLocation);
        try {
          Hits annotSetHits = searcher.search(tq);
          for (int i = 0; i < annotSetHits.length(); i++) {
            Document luceneDoc = annotSetHits.doc(i);
            String corpusID = luceneDoc.get(Constants.CORPUS_ID);
            if (corpusID == null) {
              corpusID = "";
            }
            toReturn.add(corpusID + ";" + annotSet);

            // lets create a boolean query
            TermQuery atq = new TermQuery(new Term(Constants.ANNOTATION_SET_ID, annotSet));
            BooleanQuery bq = new BooleanQuery();
            bq.add(tq, true, false);
            bq.add(atq, true, false);

            // The query runs on the searcher that is already open for this
            // annotation set; no additional searcher is opened per hit.
            Hits indexFeaturesHits = searcher.search(bq);
            for (int j = 0; j < indexFeaturesHits.length(); j++) {
              Document aDoc = indexFeaturesHits.doc(j);
              String indexedFeatures = aDoc.get(Constants.INDEXED_FEATURES);
              if (indexedFeatures == null) {
                continue;
              }
              for (String aFeature : indexedFeatures.split(";")) {
                // AnnotationType.FeatureName
                int index = aFeature.indexOf(".");
                if (index == -1) {
                  continue;
                }
                String type = aFeature.substring(0, index);
                String featureName = aFeature.substring(index + 1);
                String key = corpusID + ";" + annotSet + ";" + type;
                List<String> listOfFeatures = annotationTypesMap.get(key);
                if (listOfFeatures == null) {
                  listOfFeatures = new ArrayList<String>();
                  annotationTypesMap.put(key, listOfFeatures);
                }
                if (!listOfFeatures.contains(featureName)) {
                  listOfFeatures.add(featureName);
                }
              }
            }
          }
        } finally {
          searcher.close();
        }
      }
    } finally {
      reader.close();
    }
  } catch (IOException ioe) {
    // single translation point for every index I/O failure; the cause is kept
    throw new SearchException(ioe);
  }
  return toReturn.toArray(new String[0]);
}
Aggregations