Search in sources :

Example 1 with IndexException

use of gate.creole.annic.IndexException in project gate-core by GateNLP.

the class LuceneIndexer method add.

/**
 * Adds the given GATE documents to the existing index.
 *
 * <p>The index directory is taken from the {@code Constants.INDEX_LOCATION_URL}
 * entry of {@code parameters}. Each GATE document is converted by
 * {@code getLuceneDocuments} and every resulting Lucene document is written
 * through a single {@code IndexWriter}, which is always closed before this
 * method returns.
 *
 * @param corpusPersistenceID corpus identifier, forwarded to
 *          {@code getLuceneDocuments}
 * @param added documents to index; when {@code null} the writer is opened
 *          and closed without adding anything
 * @throws IndexException if an {@code IOException} occurs while opening,
 *           writing to or closing the index
 */
@Override
public void add(String corpusPersistenceID, List<gate.Document> added) throws IndexException {
    // TODO should we use the gate util Files method for this
    final URL indexUrl = (URL) parameters.get(Constants.INDEX_LOCATION_URL);
    String location;
    try {
        location = new File(indexUrl.toURI()).getAbsolutePath();
    } catch (URISyntaxException use) {
        // URL is not a valid URI: fall back to its raw file part
        location = new File(indexUrl.getFile()).getAbsolutePath();
    }
    try {
        IndexWriter writer = new IndexWriter(location, new LuceneAnalyzer(), false);
        try {
            if (added == null) {
                return;
            }
            for (gate.Document gateDoc : added) {
                // unsaved documents have no persistence id, so use their name
                Object persistenceId = gateDoc.getLRPersistenceId();
                String idToUse = persistenceId == null ? gateDoc.getName() : persistenceId.toString();
                System.out.print("Indexing : " + idToUse + " ...");
                List<gate.creole.annic.apache.lucene.document.Document> luceneDocs = getLuceneDocuments(corpusPersistenceID, gateDoc, location);
                if (luceneDocs != null) {
                    for (gate.creole.annic.apache.lucene.document.Document luceneDoc : luceneDocs) {
                        writer.addDocument(luceneDoc);
                    }
                }
                System.out.println("Done");
            }
        } finally {
            // make sure we close the writer, whatever happens
            writer.close();
        }
    } catch (java.io.IOException ioe) {
        throw new IndexException(ioe);
    }
}
Also used : IndexException(gate.creole.annic.IndexException) IOException(java.io.IOException) URISyntaxException(java.net.URISyntaxException) Document(gate.creole.annic.apache.lucene.document.Document) URL(java.net.URL) IndexWriter(gate.creole.annic.apache.lucene.index.IndexWriter) File(java.io.File)

Example 2 with IndexException

use of gate.creole.annic.IndexException in project gate-core by GateNLP.

the class LuceneIndexer method checkIndexParameters.

/**
 * Stores the supplied index parameters on this indexer and checks that they
 * are usable.
 *
 * <p>The checks performed are:
 * <ul>
 * <li>a parameter map was supplied;</li>
 * <li>{@code Constants.INDEX_LOCATION_URL} is present and uses the
 * {@code file} protocol;</li>
 * <li>if the index location already exists, it is a directory;</li>
 * <li>{@code Constants.BASE_TOKEN_ANNOTATION_TYPE}, when given, contains none
 * of the characters {@code . = , ;}. When it is missing or blank it is set
 * to {@code Constants.ANNIC_TOKEN} in the supplied map.</li>
 * </ul>
 *
 * @param parameters the index parameters; this map may be modified (see the
 *          base token default above)
 * @throws IndexException if any of the checks fails
 */
protected void checkIndexParameters(Map<String, Object> parameters) throws IndexException {
    // The field is assigned before validation, so it is updated even when a
    // check below fails.
    this.parameters = parameters;
    if (parameters == null) {
        throw new IndexException("No parameters provided!");
    }

    URL indexLocation = (URL) parameters.get(Constants.INDEX_LOCATION_URL);
    if (indexLocation == null) {
        throw new IndexException("You must provide a URL for INDEX_LOCATION");
    }
    if (!indexLocation.getProtocol().equalsIgnoreCase("file")) {
        throw new IndexException("Index Output Directory must be set to the empty directory on the file system");
    }

    File file;
    try {
        file = new File(indexLocation.toURI());
    } catch (URISyntaxException use) {
        // URL is not a valid URI: let the GATE utility resolve it instead
        file = Files.fileFromURL(indexLocation);
    }
    // A location that does not exist yet is accepted; one that exists must
    // be a directory.
    if (file.exists() && !file.isDirectory()) {
        throw new IndexException("Index location exists but is not a directory: " + file.getAbsolutePath());
    }

    String baseTokenAnnotationType = (String) parameters.get(Constants.BASE_TOKEN_ANNOTATION_TYPE);
    if (baseTokenAnnotationType == null || baseTokenAnnotationType.trim().isEmpty()) {
        baseTokenAnnotationType = Constants.ANNIC_TOKEN;
        parameters.put(Constants.BASE_TOKEN_ANNOTATION_TYPE, Constants.ANNIC_TOKEN);
    } else if (baseTokenAnnotationType.contains(".") || baseTokenAnnotationType.contains("=") || baseTokenAnnotationType.contains(";") || baseTokenAnnotationType.contains(",")) {
        throw new IndexException("Base token annotation type cannot have '.', '=', ',' or ';' in it");
    }

    String indexUnitAnnotationType = (String) parameters.get(Constants.INDEX_UNIT_ANNOTATION_TYPE);
    if (DEBUG) {
        System.out.println("BTAT : " + baseTokenAnnotationType);
        System.out.println("IUAT : " + indexUnitAnnotationType);
    }
}
Also used : IndexException(gate.creole.annic.IndexException) URISyntaxException(java.net.URISyntaxException) File(java.io.File) URL(java.net.URL)

Example 3 with IndexException

use of gate.creole.annic.IndexException in project gate-core by GateNLP.

the class LuceneIndexer method getNamesOfSerializedFiles.

/**
 * Looks up the index entries recorded for a document and returns the value
 * of their {@code Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE} field.
 *
 * @param documentID value matched against the {@code Constants.DOCUMENT_ID}
 *          field of the indexed entries
 * @return the distinct serialized-file ids found; empty when nothing matches
 * @throws IndexException if an {@code IOException} occurs while opening,
 *           searching or closing the index
 */
public Set<String> getNamesOfSerializedFiles(String documentID) throws IndexException {
    final URL indexUrl = (URL) parameters.get(Constants.INDEX_LOCATION_URL);
    String location;
    try {
        location = new File(indexUrl.toURI()).getAbsolutePath();
    } catch (URISyntaxException use) {
        // URL is not a valid URI: fall back to its raw file part
        location = new File(indexUrl.getFile()).getAbsolutePath();
    }

    Set<String> serializedFileIds = new HashSet<String>();
    try {
        TermQuery byDocumentId = new TermQuery(new Term(Constants.DOCUMENT_ID, documentID));
        gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(location);
        try {
            // run the query and collect one id per hit
            Hits hits = searcher.search(byDocumentId);
            for (int hit = 0; hit < hits.length(); hit++) {
                Document luceneDoc = hits.doc(hit);
                serializedFileIds.add(luceneDoc.get(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE));
            }
            return serializedFileIds;
        } finally {
            searcher.close();
        }
    } catch (IOException ioe) {
        throw new IndexException(ioe);
    }
}
Also used : IndexSearcher(gate.creole.annic.apache.lucene.search.IndexSearcher) TermQuery(gate.creole.annic.apache.lucene.search.TermQuery) Hits(gate.creole.annic.apache.lucene.search.Hits) IndexException(gate.creole.annic.IndexException) URISyntaxException(java.net.URISyntaxException) Term(gate.creole.annic.apache.lucene.index.Term) IOException(java.io.IOException) Document(gate.creole.annic.apache.lucene.document.Document) URL(java.net.URL) File(java.io.File) HashSet(java.util.HashSet)

Example 4 with IndexException

use of gate.creole.annic.IndexException in project gate-core by GateNLP.

the class LuceneIndexer method remove.

/**
 * Removes documents from the index and deletes their serialized files from
 * disk.
 *
 * <p>For each id, the serialized-file ids recorded in the index are looked
 * up with {@code getNamesOfSerializedFiles}. Each matching index entry is
 * deleted, the corresponding {@code .annic} file in the document's
 * sub-folder of {@code Constants.SERIALIZED_FOLDER_NAME} is removed, and
 * finally the sub-folder itself is deleted. Ids with no indexed entries are
 * skipped.
 *
 * @param removedIDs ids of the documents to remove. When documents are not
 *          persisted, persistence IDs are not available; in that case
 *          provide the document names instead. {@code null} removes nothing.
 * @throws IndexException if an {@code IOException} occurs while accessing
 *           the index
 */
@Override
public void remove(List<Object> removedIDs) throws IndexException {
    final URL indexUrl = (URL) parameters.get(Constants.INDEX_LOCATION_URL);
    String location;
    try {
        location = new File(indexUrl.toURI()).getAbsolutePath();
    } catch (URISyntaxException use) {
        // URL is not a valid URI: fall back to its raw file part
        location = new File(indexUrl.getFile()).getAbsolutePath();
    }
    try {
        IndexReader reader = IndexReader.open(location);
        try {
            if (removedIDs == null) {
                return;
            }
            for (Object removedID : removedIDs) {
                String id = removedID.toString();
                Set<String> serializedFilesIDs = getNamesOfSerializedFiles(id);
                if (serializedFilesIDs.size() == 0) {
                    // nothing indexed under this id
                    continue;
                }
                System.out.print("Removing => " + id + "...");
                // we have a subfolder for each document
                String folderName = getCompatibleName(id);
                File serializedRoot = new File(location, Constants.SERIALIZED_FOLDER_NAME);
                File documentFolder = new File(serializedRoot, folderName);
                for (String serializedFileID : serializedFilesIDs) {
                    reader.delete(new gate.creole.annic.apache.lucene.index.Term(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE, serializedFileID));
                    // deleting them from the disk as well
                    String fileName = getCompatibleName(serializedFileID) + ".annic";
                    File toDelete = new File(documentFolder, fileName);
                    if (toDelete.exists()) {
                        toDelete.delete();
                    }
                }
                if (documentFolder.exists() && documentFolder.isDirectory()) {
                    documentFolder.delete();
                }
                System.out.println("Done ");
            }
        } finally {
            reader.close();
        }
    } catch (java.io.IOException ioe) {
        throw new IndexException(ioe);
    }
}
Also used : IndexException(gate.creole.annic.IndexException) IOException(java.io.IOException) URISyntaxException(java.net.URISyntaxException) Term(gate.creole.annic.apache.lucene.index.Term) URL(java.net.URL) Term(gate.creole.annic.apache.lucene.index.Term) IndexReader(gate.creole.annic.apache.lucene.index.IndexReader) File(java.io.File)

Example 5 with IndexException

use of gate.creole.annic.IndexException in project gate-core by GateNLP.

the class LuceneDataStoreImpl method documentRemoved.

// Corpus Events
/**
 * Invoked whenever a document is removed from a corpus; removes that
 * document from the index.
 *
 * <p>Events whose document id is {@code null} are ignored. The call to the
 * indexer is made while holding the indexer's monitor.
 *
 * @param ce the corpus event supplying the id of the removed document
 * @throws GateRuntimeException wrapping any {@code IndexException} raised
 *           by the indexer
 */
@Override
public void documentRemoved(CorpusEvent ce) {
    final Object docLRID = ce.getDocumentLRID();
    if (docLRID == null) {
        // no id to remove from the index
        return;
    }

    // the indexer takes a list of ids, so wrap the single id
    ArrayList<Object> idsToRemove = new ArrayList<Object>();
    idsToRemove.add(docLRID);
    try {
        synchronized (indexer) {
            indexer.remove(idsToRemove);
        }
    } catch (IndexException ie) {
        throw new GateRuntimeException(ie);
    }
}
Also used : IndexException(gate.creole.annic.IndexException) GateRuntimeException(gate.util.GateRuntimeException) ArrayList(java.util.ArrayList)

Aggregations

IndexException (gate.creole.annic.IndexException)8 File (java.io.File)6 IOException (java.io.IOException)6 URL (java.net.URL)6 URISyntaxException (java.net.URISyntaxException)5 Document (gate.creole.annic.apache.lucene.document.Document)3 IndexWriter (gate.creole.annic.apache.lucene.index.IndexWriter)3 Term (gate.creole.annic.apache.lucene.index.Term)2 IndexReader (gate.creole.annic.apache.lucene.index.IndexReader)1 Hits (gate.creole.annic.apache.lucene.search.Hits)1 IndexSearcher (gate.creole.annic.apache.lucene.search.IndexSearcher)1 TermQuery (gate.creole.annic.apache.lucene.search.TermQuery)1 GateRuntimeException (gate.util.GateRuntimeException)1 FileOutputStream (java.io.FileOutputStream)1 OutputStreamWriter (java.io.OutputStreamWriter)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1