Use of gate.creole.annic.apache.lucene.document.Document in project gate-core by GateNLP — class LuceneDocument, method createDocuments.
/**
 * Converts a GATE {@link gate.Document} into one or more Lucene
 * {@link Document}s that can be stored in the ANNIC indexes. One lucene
 * document is produced per token stream; each token stream is also
 * serialized to disk (under indexLocation) so that it can be retrieved
 * again at search time.
 *
 * @param corpusPersistenceID persistence ID of the owning corpus; may be
 *          null (then no corpus ID field is added to the index)
 * @param gateDoc the document to index
 * @param documentID unique ID of the document within the index
 * @param annotSetsToInclude annotation set names to index; when non-empty
 *          this list wins over annotSetsToExclude
 * @param annotSetsToExclude annotation set names to skip (only consulted
 *          when annotSetsToInclude is empty)
 * @param featuresToInclude feature names to index (forwarded to getTokens)
 * @param featuresToExclude feature names to skip (forwarded to getTokens)
 * @param indexLocation directory where serialized token streams are written
 * @param baseTokenAnnotationType annotation type used as the base token,
 *          optionally qualified as "setName.type"
 * @param createTokensAutomatically whether tokens should be fabricated when
 *          none of the given type can be found; null is treated as false
 * @param indexUnitAnnotationType annotation type delimiting index units,
 *          optionally qualified as "setName.type"; may be null
 * @return the lucene documents to add to the index, or null if this
 *         document must be skipped (no tokens available, or a token stream
 *         could not be written to disk)
 */
public List<Document> createDocuments(String corpusPersistenceID, gate.Document gateDoc, String documentID, List<String> annotSetsToInclude, List<String> annotSetsToExclude, List<String> featuresToInclude, List<String> featuresToExclude, String indexLocation, String baseTokenAnnotationType, Boolean createTokensAutomatically, String indexUnitAnnotationType) {
  if (baseTokenAnnotationType != null)
    baseTokenAnnotationType = baseTokenAnnotationType.trim();
  List<Document> toReturnBack = new ArrayList<Document>();
  List<String> annotSetsToIndex = new ArrayList<String>();
  // Decide which annotation sets to index:
  // 1) an explicit include list wins;
  // 2) otherwise index all named sets except the excluded ones;
  // 3) otherwise index every named set plus the default set.
  if (annotSetsToInclude.size() > 0) {
    annotSetsToIndex = annotSetsToInclude;
  } else if (annotSetsToExclude.size() > 0) {
    Set<String> namedAnnotSets = new HashSet<String>();
    if (gateDoc.getNamedAnnotationSets() != null && gateDoc.getNamedAnnotationSets().keySet() != null) {
      namedAnnotSets = gateDoc.getNamedAnnotationSets().keySet();
    }
    for (String setName : namedAnnotSets) {
      if (annotSetsToExclude.contains(setName))
        continue;
      annotSetsToIndex.add(setName);
    }
    if (!annotSetsToExclude.contains(Constants.DEFAULT_ANNOTATION_SET_NAME)) {
      annotSetsToIndex.add(Constants.DEFAULT_ANNOTATION_SET_NAME);
    }
  } else {
    Set<String> namedAnnotSets = new HashSet<String>();
    if (gateDoc.getNamedAnnotationSets() != null && gateDoc.getNamedAnnotationSets().keySet() != null) {
      namedAnnotSets = gateDoc.getNamedAnnotationSets().keySet();
    }
    for (String setName : namedAnnotSets) {
      annotSetsToIndex.add(setName);
    }
    annotSetsToIndex.add(Constants.DEFAULT_ANNOTATION_SET_NAME);
  }
  // find the annotation set that contains the base token annotations
  AnnotationSet baseTokenAnnotationSet = null;
  boolean searchBaseTokensInAllAnnotationSets = false;
  boolean searchIndexUnitInAllAnnotationSets = false;
  // whether tokens have to be fabricated because none could be found
  boolean createManualTokens = false;
  // the user may have qualified the type as "setName.baseTokenAnnotationType"
  int index = -1;
  if (baseTokenAnnotationType != null && baseTokenAnnotationType.length() > 0)
    index = baseTokenAnnotationType.lastIndexOf('.');
  if (index >= 0) {
    // split "setName.type" into its two parts
    String setName = baseTokenAnnotationType.substring(0, index);
    baseTokenAnnotationType = baseTokenAnnotationType.substring(index + 1);
    if (setName.equals(Constants.DEFAULT_ANNOTATION_SET_NAME))
      baseTokenAnnotationSet = gateDoc.getAnnotations().get(baseTokenAnnotationType);
    else
      baseTokenAnnotationSet = gateDoc.getAnnotations(setName).get(baseTokenAnnotationType);
    if (baseTokenAnnotationSet == null || baseTokenAnnotationSet.size() == 0) {
      System.err.println("Base Tokens " + baseTokenAnnotationType + " couldn't be found under the specified annotation set " + setName + "\n searching them in other annotation sets");
      searchBaseTokensInAllAnnotationSets = true;
    }
  } else {
    // either the type is null or no set name was given: search in all sets
    searchBaseTokensInAllAnnotationSets = true;
  }
  if (baseTokenAnnotationType != null && baseTokenAnnotationType.length() > 0 && searchBaseTokensInAllAnnotationSets) {
    // assume manual tokens are needed until base tokens turn up in one
    // of the annotation sets to index
    createManualTokens = true;
    for (String aSet : annotSetsToIndex) {
      AnnotationSet tempSet = aSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME) ? gateDoc.getAnnotations().get(baseTokenAnnotationType) : gateDoc.getAnnotations(aSet).get(baseTokenAnnotationType);
      if (tempSet.size() > 0) {
        baseTokenAnnotationSet = tempSet;
        createManualTokens = false;
        break;
      }
    }
  }
  // no base token type at all: tokens must be created manually
  if (baseTokenAnnotationType == null || baseTokenAnnotationType.length() == 0)
    createManualTokens = true;
  if (createManualTokens) {
    // null-safe check: a null createTokensAutomatically is treated as false
    // (the previous .booleanValue() dereference could throw a NPE)
    if (!Boolean.TRUE.equals(createTokensAutomatically)) {
      System.out.println("Tokens couldn't be found in the document - Ignoring the document " + gateDoc.getName());
      return null;
    }
    baseTokenAnnotationType = Constants.ANNIC_TOKEN;
    if (baseTokenAnnotationSet == null) {
      baseTokenAnnotationSet = new AnnotationSetImpl(gateDoc);
    }
    if (!createTokens(gateDoc, baseTokenAnnotationSet)) {
      System.out.println("Tokens couldn't be created manually - Ignoring the document " + gateDoc.getName());
      return null;
    }
  }
  // by now baseTokenAnnotationSet is guaranteed non-null; next locate the
  // set containing the index unit annotations (may be "setName.type" too)
  AnnotationSet indexUnitAnnotationSet = null;
  index = -1;
  if (indexUnitAnnotationType != null && indexUnitAnnotationType.trim().length() > 0)
    index = indexUnitAnnotationType.lastIndexOf('.');
  if (index >= 0) {
    String setName = indexUnitAnnotationType.substring(0, index);
    indexUnitAnnotationType = indexUnitAnnotationType.substring(index + 1);
    if (setName.equals(Constants.DEFAULT_ANNOTATION_SET_NAME))
      indexUnitAnnotationSet = gateDoc.getAnnotations().get(indexUnitAnnotationType);
    else
      indexUnitAnnotationSet = gateDoc.getAnnotations(setName).get(indexUnitAnnotationType);
    // not found under the given set: fall back to searching all sets
    if (indexUnitAnnotationSet == null || indexUnitAnnotationSet.size() == 0) {
      System.err.println("Index Unit " + indexUnitAnnotationType + " couldn't be found under the specified annotation set " + setName + "\n searching them in other annotation sets");
      searchIndexUnitInAllAnnotationSets = true;
    }
  } else {
    // either indexUnitAnnotationType is null or no set name was given
    searchIndexUnitInAllAnnotationSets = true;
  }
  if (indexUnitAnnotationType != null && indexUnitAnnotationType.length() > 0 && searchIndexUnitInAllAnnotationSets) {
    for (String aSet : annotSetsToIndex) {
      AnnotationSet tempSet = aSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME) ? gateDoc.getAnnotations().get(indexUnitAnnotationType) : gateDoc.getAnnotations(aSet).get(indexUnitAnnotationType);
      if (tempSet.size() > 0) {
        indexUnitAnnotationSet = tempSet;
        break;
      }
    }
  }
  // index units not found anywhere: disable index units altogether
  if (indexUnitAnnotationSet == null) {
    indexUnitAnnotationType = null;
  }
  // j numbers the serialized token-stream files across all annotation sets
  int j = 0;
  for (String annotSet : annotSetsToIndex) {
    // generate the token streams for this annotation set
    AnnotationSet aSetToIndex = annotSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME) ? gateDoc.getAnnotations() : gateDoc.getAnnotations(annotSet);
    Set<String> indexedFeatures = new HashSet<String>();
    List<Token>[] tokenStreams = getTokens(gateDoc, aSetToIndex, featuresToInclude, featuresToExclude, baseTokenAnnotationType, baseTokenAnnotationSet, indexUnitAnnotationType, indexUnitAnnotationSet, indexedFeatures);
    if (tokenStreams == null)
      return null;
    // join the indexed feature names with ';' separators.
    // BUG FIX: the previous substring(0, length() - 1) threw
    // StringIndexOutOfBoundsException when no features were indexed.
    StringBuilder indexedFeaturesString = new StringBuilder();
    for (String aFeat : indexedFeatures) {
      if (indexedFeaturesString.length() > 0)
        indexedFeaturesString.append(';');
      indexedFeaturesString.append(aFeat);
    }
    Document[] toReturn = new Document[tokenStreams.length];
    for (int i = 0; i < tokenStreams.length; i++, j++) {
      // make a new, empty lucene document and populate its fields
      Document doc = new Document();
      LuceneReader reader = new LuceneReader(gateDoc, tokenStreams[i]);
      doc.add(Field.Keyword(Constants.DOCUMENT_ID, documentID));
      doc.add(Field.Keyword(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE, documentID + "-" + j));
      doc.add(Field.Keyword(Constants.INDEXED_FEATURES, indexedFeaturesString.toString()));
      if (corpusPersistenceID != null)
        doc.add(Field.Keyword(Constants.CORPUS_ID, corpusPersistenceID));
      doc.add(Field.Keyword(Constants.ANNOTATION_SET_ID, annotSet));
      doc.add(Field.Text("contents", reader));
      // persist the token stream so it can be reloaded at search time
      try {
        writeOnDisk(tokenStreams[i], documentID, documentID + "-" + j, indexLocation);
      } catch (Exception e) {
        Err.println("\nIgnoring the document : " + gateDoc.getName() + " since its token stream cannot be written on the disk");
        Err.println("Reason: " + e.getMessage());
        return null;
      }
      toReturn[i] = doc;
    }
    toReturnBack.addAll(Arrays.asList(toReturn));
  }
  return toReturnBack;
}
Use of gate.creole.annic.apache.lucene.document.Document in project gate-core by GateNLP — class LuceneIndexer, method getNamesOfSerializedFiles.
/**
 * Returns the IDs of the serialized annotation-set files indexed for the
 * given document (one entry per annotation set that was indexed).
 *
 * @param documentID the ID of the document to look up
 * @return the set of serialized-file IDs found in the index
 * @throws IndexException if the underlying lucene index cannot be searched
 */
public Set<String> getNamesOfSerializedFiles(String documentID) throws IndexException {
  // resolve the index directory from the stored URL parameter
  URL indexLocationURL = (URL) parameters.get(Constants.INDEX_LOCATION_URL);
  String location;
  try {
    location = new File(indexLocationURL.toURI()).getAbsolutePath();
  } catch (URISyntaxException use) {
    // fall back to the raw file part when the URL is not a valid URI
    location = new File(indexLocationURL.getFile()).getAbsolutePath();
  }
  Set<String> serializedFileIDs = new HashSet<String>();
  try {
    // find every lucene document indexed under this document ID
    TermQuery query = new TermQuery(new Term(Constants.DOCUMENT_ID, documentID));
    gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(location);
    try {
      Hits luceneHits = searcher.search(query);
      for (int i = 0; i < luceneHits.length(); i++) {
        serializedFileIDs.add(luceneHits.doc(i).get(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE));
      }
    } finally {
      searcher.close();
    }
    return serializedFileIDs;
  } catch (IOException ioe) {
    throw new IndexException(ioe);
  }
}
Use of gate.creole.annic.apache.lucene.document.Document in project gate-core by GateNLP — class LuceneSearcher, method next.
/**
 * Returns the next batch of hits; a numberOfHits of -1 means "all
 * remaining". Returns whatever patterns could be collected when the
 * search was unsuccessful or the forward iteration already ended.
 *
 * @param numberOfHits maximum number of hits to return, or -1 for all
 * @return the hits collected in this call
 * @throws SearchException if any of the underlying searches fails
 */
@Override
public Hit[] next(int numberOfHits) throws SearchException {
  // reset the patterns collected by the previous call.
  // (previously this field was redundantly allocated twice on the
  // early-return paths; a single allocation is enough)
  annicPatterns = new ArrayList<Pattern>();
  if (!success || fwdIterationEnded) {
    return getHits();
  }
  try {
    if (wasDeleteQuery) {
      // for a delete query, report one hit per (documentID, annotationSetID)
      // pair found in the lucene hits
      List<String> docIDs = new ArrayList<String>();
      List<String> setNames = new ArrayList<String>();
      for (int i = 0; i < luceneHits.length(); i++) {
        Document luceneDoc = luceneHits.doc(i);
        String documentID = luceneDoc.get(Constants.DOCUMENT_ID);
        String annotationSetID = luceneDoc.get(Constants.ANNOTATION_SET_ID);
        int index = docIDs.indexOf(documentID);
        if (index == -1) {
          docIDs.add(documentID);
          setNames.add(annotationSetID);
        } else {
          if (!setNames.get(index).equals(annotationSetID)) {
            docIDs.add(documentID);
            setNames.add(annotationSetID);
          }
        }
      }
      Hit[] toReturn = new Hit[docIDs.size()];
      for (int i = 0; i < toReturn.length; i++) {
        toReturn[i] = new Hit(docIDs.get(i), setNames.get(i), 0, 0, "");
      }
      return toReturn;
    }
    // drain the search threads in order; the index is a field so that the
    // next call resumes from the same thread
    for (; luceneSearchThreadIndex < luceneSearchThreads.size(); luceneSearchThreadIndex++) {
      LuceneSearchThread lst = luceneSearchThreads.get(luceneSearchThreadIndex);
      List<Pattern> results = lst.next(numberOfHits);
      if (results != null) {
        if (numberOfHits != -1) {
          numberOfHits -= results.size();
        }
        this.annicPatterns.addAll(results);
        if (numberOfHits == 0) {
          return getHits();
        }
      }
    }
    // if we are here, there were not enough patterns available, so mark
    // the forward iteration as finished for the next call
    fwdIterationEnded = true;
    return getHits();
  } catch (Exception e) {
    throw new SearchException(e);
  }
}
Use of gate.creole.annic.apache.lucene.document.Document in project gate-core by GateNLP — class FieldsReader, method doc.
/**
 * Reads the stored fields of the n-th document from the fields stream and
 * materializes them as a {@link Document}.
 *
 * @param n the document number within the segment
 * @return the reconstructed document with all stored fields
 * @throws IOException if either underlying stream cannot be read
 */
final Document doc(int n) throws IOException {
  // the index stream holds one fixed 8-byte pointer per document into
  // the fields stream
  indexStream.seek(n * 8L);
  fieldsStream.seek(indexStream.readLong());
  Document result = new Document();
  int fieldCount = fieldsStream.readVInt();
  for (int i = 0; i < fieldCount; i++) {
    // per field: field number, flag bits, then the string value
    FieldInfo info = fieldInfos.fieldInfo(fieldsStream.readVInt());
    byte bits = fieldsStream.readByte();
    boolean tokenized = (bits & 1) != 0;
    result.add(new Field(info.name, fieldsStream.readString(),
        /* stored */ true, info.isIndexed, tokenized, info.storeTermVector));
  }
  return result;
}
Use of gate.creole.annic.apache.lucene.document.Document in project gate-core by GateNLP — class LuceneSearcher, method getIndexedAnnotationSetNames.
/**
 * This method returns a set of annotation set names that are indexed. Each
 * entry has the following format:
 * <p>
 * corpusName;annotationSetName
 * </p>
 * where, the corpusName is the name of the corpus the annotationSetName
 * belongs to. As a side effect it also (re)populates annotationTypesMap,
 * keyed "corpusID;annotSet;type", with the indexed feature names.
 *
 * @return the indexed "corpusID;annotationSetName" entries
 * @throws SearchException if the index cannot be opened or searched
 */
@Override
public String[] getIndexedAnnotationSetNames() throws SearchException {
  // resolve the index directory from the datastore's indexer parameters
  String indexLocation;
  try {
    indexLocation = new File(((URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
  } catch (URISyntaxException use) {
    // fall back to the raw file part when the URL is not a valid URI
    indexLocation = new File(((URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
  }
  annotationTypesMap = new HashMap<String, List<String>>();
  Set<String> toReturn = new HashSet<String>();
  try {
    IndexReader reader = IndexReader.open(indexLocation);
    try {
      // enumerate the terms of the annotation-set-ID field to collect the
      // names of all indexed annotation sets
      TermEnum terms = reader.terms(new Term(Constants.ANNOTATION_SET_ID, ""));
      if (terms == null) {
        return new String[0];
      }
      Set<String> annotSets = new HashSet<String>();
      boolean foundAnnotSet = false;
      do {
        Term t = terms.term();
        if (t == null)
          continue;
        if (t.field().equals(Constants.ANNOTATION_SET_ID)) {
          annotSets.add(t.text());
          foundAnnotSet = true;
        } else if (foundAnnotSet) {
          // terms are sorted by field, so once we leave the field we're done
          break;
        }
      } while (terms.next());
      // for each annotation set, find the corpora it occurs in and record
      // the indexed features per annotation type
      for (String annotSet : annotSets) {
        TermQuery tq = new TermQuery(new Term(Constants.ANNOTATION_SET_ID, annotSet));
        try {
          gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(indexLocation);
          try {
            Hits annotSetHits = searcher.search(tq);
            for (int i = 0; i < annotSetHits.length(); i++) {
              Document luceneDoc = annotSetHits.doc(i);
              String corpusID = luceneDoc.get(Constants.CORPUS_ID);
              // documents from transient corpora carry no corpus ID
              if (corpusID == null)
                corpusID = "";
              toReturn.add(corpusID + ";" + annotSet);
              // NOTE(review): this boolean query combines two identical
              // annotation-set term queries; kept as-is for compatibility,
              // but it looks like one clause was meant to be a corpus query.
              Term annotSetTerm = new Term(Constants.ANNOTATION_SET_ID, annotSet);
              TermQuery atq = new TermQuery(annotSetTerm);
              BooleanQuery bq = new BooleanQuery();
              bq.add(tq, true, false);
              bq.add(atq, true, false);
              // BUG FIX: a second IndexSearcher used to be created here but
              // never used (the search below always went through the outer
              // searcher); the redundant searcher has been removed.
              Hits indexFeaturesHits = searcher.search(bq);
              for (int j = 0; j < indexFeaturesHits.length(); j++) {
                Document aDoc = indexFeaturesHits.doc(j);
                String indexedFeatures = aDoc.get(Constants.INDEXED_FEATURES);
                if (indexedFeatures != null) {
                  // each entry has the form AnnotationType.FeatureName
                  String[] features = indexedFeatures.split(";");
                  for (String aFeature : features) {
                    int index = aFeature.indexOf(".");
                    if (index == -1) {
                      continue;
                    }
                    String type = aFeature.substring(0, index);
                    String featureName = aFeature.substring(index + 1);
                    String key = corpusID + ";" + annotSet + ";" + type;
                    List<String> listOfFeatures = annotationTypesMap.get(key);
                    if (listOfFeatures == null) {
                      listOfFeatures = new ArrayList<String>();
                      annotationTypesMap.put(key, listOfFeatures);
                    }
                    if (!listOfFeatures.contains(featureName)) {
                      listOfFeatures.add(featureName);
                    }
                  }
                }
              }
            }
          } finally {
            searcher.close();
          }
        } catch (IOException ioe) {
          ioe.printStackTrace();
          throw new SearchException(ioe);
        }
      }
    } finally {
      reader.close();
    }
  } catch (IOException ioe) {
    throw new SearchException(ioe);
  }
  return toReturn.toArray(new String[0]);
}
Aggregations