Search in sources:

Example 1 with Document

use of gate.creole.annic.apache.lucene.document.Document in project gate-core by GateNLP.

From the class LuceneDocument, the method createDocuments.

/**
 * Given an instance of Gate Document, it converts it into the format that
 * lucene can understand and can store in its indexes. This method also stores
 * the tokenStream on the disk in order to retrieve it at the time of
 * searching.
 * <p>
 * One lucene {@link Document} is produced per token stream of every
 * annotation set selected for indexing.
 *
 * @param corpusPersistenceID persistence ID of the owning corpus, or null if
 *          the document does not belong to a persisted corpus
 * @param gateDoc the GATE document to convert
 * @param documentID unique ID of the document in the datastore
 * @param annotSetsToInclude annotation set names to index; when non-empty it
 *          takes precedence over annotSetsToExclude
 * @param annotSetsToExclude annotation set names to skip (only consulted when
 *          annotSetsToInclude is empty)
 * @param featuresToInclude feature names to index (passed to getTokens)
 * @param featuresToExclude feature names to skip (passed to getTokens)
 * @param indexLocation directory where serialized token streams are written
 * @param baseTokenAnnotationType annotation type used as the token unit,
 *          optionally qualified as "setName.type"
 * @param createTokensAutomatically if true, tokens are generated when no
 *          annotation set contains base tokens
 * @param indexUnitAnnotationType annotation type used as the index unit
 *          (e.g. sentence), optionally qualified as "setName.type"
 * @return the lucene documents to add to the index, or null when the document
 *         must be skipped (no tokens available, or a token stream could not
 *         be written to disk)
 */
public List<Document> createDocuments(String corpusPersistenceID, gate.Document gateDoc, String documentID, List<String> annotSetsToInclude, List<String> annotSetsToExclude, List<String> featuresToInclude, List<String> featuresToExclude, String indexLocation, String baseTokenAnnotationType, Boolean createTokensAutomatically, String indexUnitAnnotationType) {
    if (baseTokenAnnotationType != null)
        baseTokenAnnotationType = baseTokenAnnotationType.trim();
    List<Document> toReturnBack = new ArrayList<Document>();
    List<String> annotSetsToIndex = new ArrayList<String>();
    // decide which annotation sets to index: an explicit include list wins,
    // otherwise index everything except the explicit exclude list
    if (annotSetsToInclude.size() > 0) {
        annotSetsToIndex = annotSetsToInclude;
    } else if (annotSetsToExclude.size() > 0) {
        // no include list: index every annotation set except the excluded ones
        Set<String> namedAnnotSets = new HashSet<String>();
        if (gateDoc.getNamedAnnotationSets() != null && gateDoc.getNamedAnnotationSets().keySet() != null) {
            namedAnnotSets = gateDoc.getNamedAnnotationSets().keySet();
        }
        for (String setName : namedAnnotSets) {
            if (annotSetsToExclude.contains(setName))
                continue;
            annotSetsToIndex.add(setName);
        }
        // the default (unnamed) set is not part of getNamedAnnotationSets()
        if (!annotSetsToExclude.contains(Constants.DEFAULT_ANNOTATION_SET_NAME)) {
            annotSetsToIndex.add(Constants.DEFAULT_ANNOTATION_SET_NAME);
        }
    } else {
        // neither include nor exclude given: index all annotation sets
        Set<String> namedAnnotSets = new HashSet<String>();
        if (gateDoc.getNamedAnnotationSets() != null && gateDoc.getNamedAnnotationSets().keySet() != null) {
            namedAnnotSets = gateDoc.getNamedAnnotationSets().keySet();
        }
        for (String setName : namedAnnotSets) {
            annotSetsToIndex.add(setName);
        }
        annotSetsToIndex.add(Constants.DEFAULT_ANNOTATION_SET_NAME);
    }
    // locate the annotation set that contains the base token annotations
    AnnotationSet baseTokenAnnotationSet = null;
    // set when the user-specified set does not contain the base tokens and we
    // must fall back to searching every annotation set to index
    boolean searchBaseTokensInAllAnnotationSets = false;
    boolean searchIndexUnitInAllAnnotationSets = false;
    // whether we have to generate tokens ourselves
    boolean createManualTokens = false;
    // the user may qualify the type as "setName.baseTokenAnnotationType"
    int index = -1;
    if (baseTokenAnnotationType != null && baseTokenAnnotationType.length() > 0)
        index = baseTokenAnnotationType.lastIndexOf('.');
    if (index >= 0) {
        // split off the annotation set name from the token type
        String setName = baseTokenAnnotationType.substring(0, index);
        baseTokenAnnotationType = baseTokenAnnotationType.substring(index + 1, baseTokenAnnotationType.length());
        if (setName.equals(Constants.DEFAULT_ANNOTATION_SET_NAME))
            baseTokenAnnotationSet = gateDoc.getAnnotations().get(baseTokenAnnotationType);
        else
            baseTokenAnnotationSet = gateDoc.getAnnotations(setName).get(baseTokenAnnotationType);
        if (baseTokenAnnotationSet == null || baseTokenAnnotationSet.size() == 0) {
            System.err.println("Base Tokens " + baseTokenAnnotationType + " couldn't be found under the specified annotation set " + setName + "\n searching them in other annotation sets");
            searchBaseTokensInAllAnnotationSets = true;
        }
    } else {
        // either baseTokenAnnotationType is null or the user gave no set name
        // qualifier, so we search in all annotation sets
        searchBaseTokensInAllAnnotationSets = true;
    }
    if (baseTokenAnnotationType != null && baseTokenAnnotationType.length() > 0 && searchBaseTokensInAllAnnotationSets) {
        // assume manual tokens are needed until base tokens are found in one
        // of the annotation sets to index
        createManualTokens = true;
        for (String aSet : annotSetsToIndex) {
            if (aSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME)) {
                AnnotationSet tempSet = gateDoc.getAnnotations().get(baseTokenAnnotationType);
                if (tempSet.size() > 0) {
                    baseTokenAnnotationSet = tempSet;
                    createManualTokens = false;
                    break;
                }
            } else {
                AnnotationSet tempSet = gateDoc.getAnnotations(aSet).get(baseTokenAnnotationType);
                if (tempSet.size() > 0) {
                    baseTokenAnnotationSet = tempSet;
                    createManualTokens = false;
                    break;
                }
            }
        }
    }
    // no base token type at all: we'll have to create tokens ourselves
    if (baseTokenAnnotationType == null || baseTokenAnnotationType.length() == 0)
        createManualTokens = true;
    if (createManualTokens) {
        if (!createTokensAutomatically.booleanValue()) {
            System.out.println("Tokens couldn't be found in the document - Ignoring the document " + gateDoc.getName());
            return null;
        }
        baseTokenAnnotationType = Constants.ANNIC_TOKEN;
        if (baseTokenAnnotationSet == null) {
            baseTokenAnnotationSet = new AnnotationSetImpl(gateDoc);
        }
        if (!createTokens(gateDoc, baseTokenAnnotationSet)) {
            System.out.println("Tokens couldn't be created manually - Ignoring the document " + gateDoc.getName());
            return null;
        }
    }
    // by now baseTokenAnnotationSet is guaranteed non-null and
    // baseTokenAnnotationType is known; next locate the annotation set that
    // contains the index unit annotations
    AnnotationSet indexUnitAnnotationSet = null;
    // the user may qualify the type as "setName.indexUnitAnnotationType"
    index = -1;
    if (indexUnitAnnotationType != null && indexUnitAnnotationType.trim().length() > 0)
        index = indexUnitAnnotationType.lastIndexOf('.');
    if (index >= 0) {
        String setName = indexUnitAnnotationType.substring(0, index);
        indexUnitAnnotationType = indexUnitAnnotationType.substring(index + 1, indexUnitAnnotationType.length());
        if (setName.equals(Constants.DEFAULT_ANNOTATION_SET_NAME))
            indexUnitAnnotationSet = gateDoc.getAnnotations().get(indexUnitAnnotationType);
        else
            indexUnitAnnotationSet = gateDoc.getAnnotations(setName).get(indexUnitAnnotationType);
        // not found under the specified set: search other annotation sets
        if (indexUnitAnnotationSet == null || indexUnitAnnotationSet.size() == 0) {
            System.err.println("Index Unit " + indexUnitAnnotationType + " couldn't be found under the specified annotation set " + setName + "\n searching them in other annotation sets");
            searchIndexUnitInAllAnnotationSets = true;
        }
    } else {
        // either indexUnitAnnotationType is null or no set name was provided
        searchIndexUnitInAllAnnotationSets = true;
    }
    // searching in all annotation sets selected for indexing
    if (indexUnitAnnotationType != null && indexUnitAnnotationType.length() > 0 && searchIndexUnitInAllAnnotationSets) {
        for (String aSet : annotSetsToIndex) {
            if (aSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME)) {
                AnnotationSet tempSet = gateDoc.getAnnotations().get(indexUnitAnnotationType);
                if (tempSet.size() > 0) {
                    indexUnitAnnotationSet = tempSet;
                    break;
                }
            } else {
                AnnotationSet tempSet = gateDoc.getAnnotations(aSet).get(indexUnitAnnotationType);
                if (tempSet.size() > 0) {
                    indexUnitAnnotationSet = tempSet;
                    break;
                }
            }
        }
    }
    // no index unit found anywhere: clear the type as well so downstream
    // code treats the document as having no index unit
    if (indexUnitAnnotationSet == null) {
        indexUnitAnnotationType = null;
    }
    // j numbers the serialized token-stream files uniquely across all
    // annotation sets of this document
    int j = 0;
    for (String annotSet : annotSetsToIndex) {
        // generate the token streams for this annotation set; they are fed to
        // the GateLuceneReader below
        AnnotationSet aSetToIndex = annotSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME) ? gateDoc.getAnnotations() : gateDoc.getAnnotations(annotSet);
        Set<String> indexedFeatures = new HashSet<String>();
        List<Token>[] tokenStreams = getTokens(gateDoc, aSetToIndex, featuresToInclude, featuresToExclude, baseTokenAnnotationType, baseTokenAnnotationSet, indexUnitAnnotationType, indexUnitAnnotationSet, indexedFeatures);
        // getTokens signals failure with null
        if (tokenStreams == null)
            return null;
        // build the "feat1;feat2" list stored with every lucene document
        StringBuilder indexedFeaturesString = new StringBuilder();
        for (String aFeat : indexedFeatures) {
            indexedFeaturesString.append(aFeat).append(';');
        }
        // strip the trailing ';' — guarded so that an empty feature set no
        // longer triggers a StringIndexOutOfBoundsException
        String indexedFeaturesValue = indexedFeaturesString.length() > 0 ? indexedFeaturesString.substring(0, indexedFeaturesString.length() - 1) : "";
        Document[] toReturn = new Document[tokenStreams.length];
        for (int i = 0; i < tokenStreams.length; i++, j++) {
            // make a new, empty document and populate its fields
            Document doc = new Document();
            LuceneReader reader = new LuceneReader(gateDoc, tokenStreams[i]);
            doc.add(Field.Keyword(Constants.DOCUMENT_ID, documentID));
            doc.add(Field.Keyword(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE, documentID + "-" + j));
            doc.add(Field.Keyword(Constants.INDEXED_FEATURES, indexedFeaturesValue));
            if (corpusPersistenceID != null)
                doc.add(Field.Keyword(Constants.CORPUS_ID, corpusPersistenceID));
            doc.add(Field.Keyword(Constants.ANNOTATION_SET_ID, annotSet));
            doc.add(Field.Text("contents", reader));
            // persist the token stream so it can be reloaded at search time
            try {
                writeOnDisk(tokenStreams[i], documentID, documentID + "-" + j, indexLocation);
            } catch (Exception e) {
                Err.println("\nIgnoring the document : " + gateDoc.getName() + " since its token stream cannot be written on the disk");
                Err.println("Reason: " + e.getMessage());
                return null;
            }
            toReturn[i] = doc;
        }
        toReturnBack.addAll(Arrays.asList(toReturn));
    }
    return toReturnBack;
}
Also used : HashSet(java.util.HashSet) AnnotationSet(gate.AnnotationSet) Set(java.util.Set) ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) Document(gate.creole.annic.apache.lucene.document.Document) InvalidOffsetException(gate.util.InvalidOffsetException) GateRuntimeException(gate.util.GateRuntimeException) IOException(java.io.IOException) AnnotationSetImpl(gate.annotation.AnnotationSetImpl) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet)

Example 2 with Document

use of gate.creole.annic.apache.lucene.document.Document in project gate-core by GateNLP.

From the class LuceneIndexer, the method getNamesOfSerializedFiles.

/**
 * Returns the IDs of all serialized token-stream files stored in the index
 * for the given document (one per indexed annotation set). The IDs are read
 * from the {@code DOCUMENT_ID_FOR_SERIALIZED_FILE} field of every lucene
 * document whose {@code DOCUMENT_ID} matches.
 *
 * @param documentID the datastore ID of the document
 * @return the serialized-file IDs found for this document
 * @throws IndexException if the index cannot be searched
 */
public Set<String> getNamesOfSerializedFiles(String documentID) throws IndexException {
    String location = null;
    try {
        location = new File(((URL) parameters.get(Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
    } catch (URISyntaxException use) {
        // fall back for index URLs that do not form a strictly valid URI
        location = new File(((URL) parameters.get(Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
    }
    Set<String> serializedFileIds = new HashSet<String>();
    try {
        TermQuery query = new TermQuery(new Term(Constants.DOCUMENT_ID, documentID));
        gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(location);
        try {
            // run the query and harvest the serialized-file ID of every hit
            Hits matches = searcher.search(query);
            for (int hitIndex = 0; hitIndex < matches.length(); hitIndex++) {
                Document luceneDoc = matches.doc(hitIndex);
                serializedFileIds.add(luceneDoc.get(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE));
            }
            return serializedFileIds;
        } finally {
            searcher.close();
        }
    } catch (IOException ioe) {
        throw new IndexException(ioe);
    }
}
Also used : IndexSearcher(gate.creole.annic.apache.lucene.search.IndexSearcher) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) Hits(gate.creole.annic.apache.lucene.search.Hits) IndexException(gate.creole.annic.IndexException) URISyntaxException(java.net.URISyntaxException) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException) Document(gate.creole.annic.apache.lucene.document.Document) URL(java.net.URL) File(java.io.File) HashSet(java.util.HashSet)

Example 3 with Document

use of gate.creole.annic.apache.lucene.document.Document in project gate-core by GateNLP.

From the class LuceneSearcher, the method next.

/**
 * Return the next numberOfHits -1 indicates all.
 * <p>
 * For a delete query, returns one Hit per (documentID, annotationSetID)
 * pair found in the lucene hits. Otherwise drains the pending search
 * threads (resuming from the previous call) until numberOfHits patterns
 * have been collected, and returns them via getHits(). Once iteration is
 * exhausted or the search failed, an empty result is returned.
 */
@Override
public Hit[] next(int numberOfHits) throws SearchException {
    // start every call with a fresh pattern buffer
    annicPatterns = new ArrayList<Pattern>();
    if (!success) {
        // the query failed earlier - nothing to return
        this.annicPatterns = new ArrayList<Pattern>();
        return getHits();
    }
    if (fwdIterationEnded) {
        // all results have already been consumed
        this.annicPatterns = new ArrayList<Pattern>();
        return getHits();
    }
    try {
        if (wasDeleteQuery) {
            // for a delete query we only report which document/annotation-set
            // combinations matched, not the individual patterns
            List<String> docIDs = new ArrayList<String>();
            List<String> setNames = new ArrayList<String>();
            for (int i = 0; i < luceneHits.length(); i++) {
                Document luceneDoc = luceneHits.doc(i);
                String documentID = luceneDoc.get(Constants.DOCUMENT_ID);
                String annotationSetID = luceneDoc.get(Constants.ANNOTATION_SET_ID);
                int index = docIDs.indexOf(documentID);
                if (index == -1) {
                    docIDs.add(documentID);
                    setNames.add(annotationSetID);
                } else {
                    // same document may legitimately appear with another set.
                    // NOTE(review): indexOf only finds the FIRST occurrence of
                    // documentID, so a repeated (doc, set) pair whose set differs
                    // from that first entry is added again, producing duplicate
                    // hits - confirm whether this is intended
                    if (!setNames.get(index).equals(annotationSetID)) {
                        docIDs.add(documentID);
                        setNames.add(annotationSetID);
                    }
                }
            }
            Hit[] toReturn = new Hit[docIDs.size()];
            for (int i = 0; i < toReturn.length; i++) {
                toReturn[i] = new Hit(docIDs.get(i), setNames.get(i), 0, 0, "");
            }
            return toReturn;
        }
        // gather patterns from the search threads; luceneSearchThreadIndex is
        // a field, so the next call resumes where this one stopped
        for (; luceneSearchThreadIndex < luceneSearchThreads.size(); luceneSearchThreadIndex++) {
            LuceneSearchThread lst = luceneSearchThreads.get(luceneSearchThreadIndex);
            List<Pattern> results = lst.next(numberOfHits);
            if (results != null) {
                if (numberOfHits != -1) {
                    numberOfHits -= results.size();
                }
                this.annicPatterns.addAll(results);
                if (numberOfHits == 0) {
                    return getHits();
                }
            }
        }
        // if we are here, there were no sufficient patterns available
        // so what we do is make success to false so that this method
        // return null on next call
        fwdIterationEnded = true;
        return getHits();
    } catch (Exception e) {
        throw new SearchException(e);
    }
}
Also used : Pattern(gate.creole.annic.Pattern) ArrayList(java.util.ArrayList) SearchException(gate.creole.annic.SearchException) Document(gate.creole.annic.apache.lucene.document.Document) URISyntaxException(java.net.URISyntaxException) IOException(java.io.IOException) SearchException(gate.creole.annic.SearchException) Hit(gate.creole.annic.Hit)

Example 4 with Document

use of gate.creole.annic.apache.lucene.document.Document in project gate-core by GateNLP.

From the class FieldsReader, the method doc.

/**
 * Reconstructs document number {@code n} from its stored fields. The index
 * stream holds one 8-byte pointer per document into the fields stream; the
 * fields stream holds, per document, a field count followed by
 * (fieldNumber, bits, value) triples.
 */
final Document doc(int n) throws IOException {
    // locate this document's entry via the 8-bytes-per-doc pointer table
    indexStream.seek(n * 8L);
    fieldsStream.seek(indexStream.readLong());
    Document result = new Document();
    int remaining = fieldsStream.readVInt();
    while (remaining-- > 0) {
        FieldInfo info = fieldInfos.fieldInfo(fieldsStream.readVInt());
        byte bits = fieldsStream.readByte();
        // bit 0 of "bits" records whether the field value was tokenized;
        // fields read back from the fields file are always stored
        boolean tokenized = (bits & 1) != 0;
        result.add(new Field(info.name, fieldsStream.readString(), true, info.isIndexed, tokenized, info.storeTermVector));
    }
    return result;
}
Also used : Document(gate.creole.annic.apache.lucene.document.Document)

Example 5 with Document

use of gate.creole.annic.apache.lucene.document.Document in project gate-core by GateNLP.

From the class LuceneSearcher, the method getIndexedAnnotationSetNames.

/**
 * This method returns a set of annotation set names that are indexed. Each
 * entry has the following format:
 * <p>
 * corpusName;annotationSetName
 * </p>
 * where, the corpusName is the name of the corpus the annotationSetName
 * belongs to.
 * <p>
 * As a side effect it rebuilds {@code annotationTypesMap}, mapping each
 * "corpusID;annotSet;annotationType" key to the list of indexed feature
 * names for that annotation type.
 *
 * @return the indexed "corpusID;annotationSetName" pairs
 * @throws SearchException if the index cannot be read or searched
 */
@Override
public String[] getIndexedAnnotationSetNames() throws SearchException {
    String indexLocation;
    try {
        indexLocation = new File(((URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
    } catch (URISyntaxException use) {
        // fall back for index URLs that do not form a strictly valid URI
        indexLocation = new File(((URL) datastore.getIndexer().getParameters().get(Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
    }
    annotationTypesMap = new HashMap<String, List<String>>();
    Set<String> toReturn = new HashSet<String>();
    try {
        IndexReader reader = IndexReader.open(indexLocation);
        try {
            // enumerate the stored terms to discover all indexed annotation
            // set names
            TermEnum terms = reader.terms(new Term(Constants.ANNOTATION_SET_ID, ""));
            if (terms == null) {
                return new String[0];
            }
            Set<String> annotSets = new HashSet<String>();
            boolean foundAnnotSet = false;
            do {
                Term t = terms.term();
                if (t == null)
                    continue;
                if (t.field().equals(Constants.ANNOTATION_SET_ID)) {
                    annotSets.add(t.text());
                    foundAnnotSet = true;
                } else {
                    // terms are ordered by field, so once we have seen the
                    // ANNOTATION_SET_ID field and moved past it we can stop
                    if (foundAnnotSet)
                        break;
                }
            } while (terms.next());
            // for each annotation set, find the corpora it appears in
            // (not all documents belong to a corpus)
            for (String annotSet : annotSets) {
                Term term = new Term(Constants.ANNOTATION_SET_ID, annotSet);
                TermQuery tq = new TermQuery(term);
                try {
                    gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(indexLocation);
                    try {
                        Hits annotSetHits = searcher.search(tq);
                        for (int i = 0; i < annotSetHits.length(); i++) {
                            Document luceneDoc = annotSetHits.doc(i);
                            String corpusID = luceneDoc.get(Constants.CORPUS_ID);
                            if (corpusID == null)
                                corpusID = "";
                            toReturn.add(corpusID + ";" + annotSet);
                            // query again to collect the indexed features for
                            // this annotation set
                            Term annotSetTerm = new Term(Constants.ANNOTATION_SET_ID, annotSet);
                            TermQuery atq = new TermQuery(annotSetTerm);
                            BooleanQuery bq = new BooleanQuery();
                            bq.add(tq, true, false);
                            bq.add(atq, true, false);
                            // reuse the already-open searcher: the original
                            // code opened a second IndexSearcher here per hit
                            // and never used it (it still searched with
                            // "searcher"), wasting an index open/close cycle
                            Hits indexFeaturesHits = searcher.search(bq);
                            for (int j = 0; j < indexFeaturesHits.length(); j++) {
                                Document aDoc = indexFeaturesHits.doc(j);
                                String indexedFeatures = aDoc.get(Constants.INDEXED_FEATURES);
                                if (indexedFeatures != null) {
                                    String[] features = indexedFeatures.split(";");
                                    for (String aFeature : features) {
                                        // each entry is AnnotationType.FeatureName
                                        int index = aFeature.indexOf(".");
                                        if (index == -1) {
                                            continue;
                                        }
                                        String type = aFeature.substring(0, index);
                                        String featureName = aFeature.substring(index + 1);
                                        String key = corpusID + ";" + annotSet + ";" + type;
                                        List<String> listOfFeatures = annotationTypesMap.get(key);
                                        if (listOfFeatures == null) {
                                            listOfFeatures = new ArrayList<String>();
                                            annotationTypesMap.put(key, listOfFeatures);
                                        }
                                        if (!listOfFeatures.contains(featureName)) {
                                            listOfFeatures.add(featureName);
                                        }
                                    }
                                }
                            }
                        }
                    } finally {
                        searcher.close();
                    }
                } catch (IOException ioe) {
                    // cause is preserved in the wrapper; no printStackTrace
                    throw new SearchException(ioe);
                }
            }
        } finally {
            reader.close();
        }
    } catch (IOException ioe) {
        throw new SearchException(ioe);
    }
    return toReturn.toArray(new String[0]);
}
Also used : IndexSearcher(gate.creole.annic.apache.lucene.search.IndexSearcher) BooleanQuery(gate.creole.annic.apache.lucene.search.BooleanQuery) Hits(gate.creole.annic.apache.lucene.search.Hits) SearchException(gate.creole.annic.SearchException) URISyntaxException(java.net.URISyntaxException) TermEnum(gate.creole.annic.apache.lucene.index.TermEnum) Document(gate.creole.annic.apache.lucene.document.Document) URL(java.net.URL) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException) IndexReader(gate.creole.annic.apache.lucene.index.IndexReader) File(java.io.File)

Aggregations

Document (gate.creole.annic.apache.lucene.document.Document)5 IOException (java.io.IOException)4 URISyntaxException (java.net.URISyntaxException)3 ArrayList (java.util.ArrayList)3 HashSet (java.util.HashSet)3 SearchException (gate.creole.annic.SearchException)2 Term (gate.creole.annic.apache.lucene.index.Term)2 Hits (gate.creole.annic.apache.lucene.search.Hits)2 IndexSearcher (gate.creole.annic.apache.lucene.search.IndexSearcher)2 TermQuery (gate.creole.annic.apache.lucene.search.TermQuery)2 File (java.io.File)2 URL (java.net.URL)2 List (java.util.List)2 AnnotationSet (gate.AnnotationSet)1 AnnotationSetImpl (gate.annotation.AnnotationSetImpl)1 Hit (gate.creole.annic.Hit)1 IndexException (gate.creole.annic.IndexException)1 Pattern (gate.creole.annic.Pattern)1 IndexReader (gate.creole.annic.apache.lucene.index.IndexReader)1 TermEnum (gate.creole.annic.apache.lucene.index.TermEnum)1