Use of gate.creole.annic.IndexException in project gate-core by GateNLP.
The class LuceneIndexer, method add.
/**
 * Adds the Lucene documents generated for each supplied GATE document to
 * the index stored at the configured index location.
 *
 * @param corpusPersistenceID identifier of the corpus; handed unchanged to
 *        {@code getLuceneDocuments}
 * @param added the documents to index; when {@code null} the writer is
 *        opened and closed without adding anything
 * @throws IndexException if an I/O error occurs while opening, writing to
 *         or closing the index
 */
@Override
public void add(String corpusPersistenceID, List<gate.Document> added) throws IndexException {
  // TODO should we use the gate util Files method for this
  URL indexUrl = (URL) parameters.get(Constants.INDEX_LOCATION_URL);
  String location;
  try {
    location = new File(indexUrl.toURI()).getAbsolutePath();
  } catch (URISyntaxException use) {
    // the URL is not a valid URI: fall back to its raw file component
    location = new File(indexUrl.getFile()).getAbsolutePath();
  }

  try {
    // NOTE(review): the last argument is presumably the "create" flag, i.e.
    // false opens the index that is already there - confirm against IndexWriter
    IndexWriter writer = new IndexWriter(location, new LuceneAnalyzer(), false);
    try {
      if (added != null) {
        for (gate.Document gateDoc : added) {
          // documents that were never persisted have no persistence ID,
          // so their name is used for reporting instead
          String idToUse;
          if (gateDoc.getLRPersistenceId() == null) {
            idToUse = gateDoc.getName();
          } else {
            idToUse = gateDoc.getLRPersistenceId().toString();
          }
          System.out.print("Indexing : " + idToUse + " ...");

          List<gate.creole.annic.apache.lucene.document.Document> luceneDocs =
              getLuceneDocuments(corpusPersistenceID, gateDoc, location);
          // a null result means there is nothing to write for this document
          if (luceneDocs != null) {
            for (gate.creole.annic.apache.lucene.document.Document luceneDoc : luceneDocs) {
              writer.addDocument(luceneDoc);
            }
          }
          System.out.println("Done");
        }
      }
    } finally {
      // the writer is closed on every path, including failures
      writer.close();
    }
  } catch (java.io.IOException ioe) {
    throw new IndexException(ioe);
  }
}
Use of gate.creole.annic.IndexException in project gate-core by GateNLP.
The class LuceneIndexer, method checkIndexParameters.
/**
 * Validates the index parameters and stores them on this indexer.
 *
 * <p>The index location must be a {@code file:} URL; if the path it points
 * to already exists it must be a directory. When no base token annotation
 * type is supplied (missing or blank), {@code Constants.ANNIC_TOKEN} is
 * written into the supplied map as the default.
 *
 * @param parameters the index parameters; this map may be modified (see
 *        above) and is kept by reference in {@code this.parameters}
 * @throws IndexException if {@code parameters} is {@code null}, the index
 *         location is missing, is not a {@code file:} URL or exists but is
 *         not a directory, or the base token annotation type contains one
 *         of the characters {@code . = , ;}
 */
protected void checkIndexParameters(Map<String, Object> parameters) throws IndexException {
  // the parameters are stored before validation, so this field is
  // overwritten even when one of the checks below fails
  this.parameters = parameters;
  if (parameters == null) {
    throw new IndexException("No parameters provided!");
  }

  URL indexLocation = (URL) parameters.get(Constants.INDEX_LOCATION_URL);
  if (indexLocation == null) {
    throw new IndexException("You must provide a URL for INDEX_LOCATION");
  }
  if (!indexLocation.getProtocol().equalsIgnoreCase("file")) {
    throw new IndexException("Index Output Directory must be set to the empty directory on the file system");
  }

  File file;
  try {
    file = new File(indexLocation.toURI());
  } catch (URISyntaxException use) {
    // the URL is not a valid URI: let the GATE utility resolve it instead
    file = Files.fileFromURL(indexLocation);
  }
  // a location that does not exist yet is accepted; one that exists must be a directory
  if (file.exists() && !file.isDirectory()) {
    throw new IndexException("Index location exists but is not a directory: " + file.getAbsolutePath());
  }

  String baseTokenAnnotationType = (String) parameters.get(Constants.BASE_TOKEN_ANNOTATION_TYPE);
  if (baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length() == 0) {
    baseTokenAnnotationType = Constants.ANNIC_TOKEN;
    parameters.put(Constants.BASE_TOKEN_ANNOTATION_TYPE, Constants.ANNIC_TOKEN);
  } else if (baseTokenAnnotationType.contains(".") || baseTokenAnnotationType.contains("=")
      || baseTokenAnnotationType.contains(";") || baseTokenAnnotationType.contains(",")) {
    throw new IndexException("Base token annotation type cannot have '.', '=', ',' or ';' in it");
  }

  // the index unit annotation type is read only for the debug output below
  String indexUnitAnnotationType = (String) parameters.get(Constants.INDEX_UNIT_ANNOTATION_TYPE);
  if (DEBUG) {
    System.out.println("BTAT : " + baseTokenAnnotationType);
    System.out.println("IUAT : " + indexUnitAnnotationType);
  }
}
Use of gate.creole.annic.IndexException in project gate-core by GateNLP.
The class LuceneIndexer, method getNamesOfSerializedFiles.
/**
 * Looks up every index entry whose {@code Constants.DOCUMENT_ID} field
 * equals the given document ID and collects the value of each entry's
 * {@code Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE} field.
 *
 * @param documentID the document ID to search the index for
 * @return the distinct serialized-file IDs found; empty when nothing matches
 * @throws IndexException if an I/O error occurs while searching the index
 */
public Set<String> getNamesOfSerializedFiles(String documentID) throws IndexException {
  URL indexUrl = (URL) parameters.get(Constants.INDEX_LOCATION_URL);
  String location;
  try {
    location = new File(indexUrl.toURI()).getAbsolutePath();
  } catch (URISyntaxException use) {
    // the URL is not a valid URI: fall back to its raw file component
    location = new File(indexUrl.getFile()).getAbsolutePath();
  }

  Set<String> serializedFileIds = new HashSet<String>();
  try {
    TermQuery byDocumentId = new TermQuery(new Term(Constants.DOCUMENT_ID, documentID));
    gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(location);
    try {
      // run the query and pull the serialized-file ID out of each hit
      Hits hits = searcher.search(byDocumentId);
      int hitCount = hits.length();
      for (int hit = 0; hit < hitCount; hit++) {
        Document luceneDoc = hits.doc(hit);
        serializedFileIds.add(luceneDoc.get(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE));
      }
    } finally {
      searcher.close();
    }
  } catch (IOException ioe) {
    throw new IndexException(ioe);
  }
  return serializedFileIds;
}
Use of gate.creole.annic.IndexException in project gate-core by GateNLP.
The class LuceneIndexer, method remove.
/**
 * Removes documents from the index and deletes their serialized
 * {@code .annic} files from disk.
 *
 * @param removedIDs IDs of the documents to remove; when documents are not
 *        persisted, persistence IDs are not available, so provide the
 *        document names instead. May be {@code null}, in which case nothing
 *        is removed.
 * @throws IndexException if an I/O error occurs while reading or updating
 *         the index
 */
@Override
public void remove(List<Object> removedIDs) throws IndexException {
  URL indexUrl = (URL) parameters.get(Constants.INDEX_LOCATION_URL);
  String location;
  try {
    location = new File(indexUrl.toURI()).getAbsolutePath();
  } catch (URISyntaxException use) {
    // the URL is not a valid URI: fall back to its raw file component
    location = new File(indexUrl.getFile()).getAbsolutePath();
  }

  try {
    IndexReader reader = IndexReader.open(location);
    try {
      if (removedIDs != null) {
        for (Object removedID : removedIDs) {
          String id = removedID.toString();
          Set<String> serializedFilesIDs = getNamesOfSerializedFiles(id);
          // nothing is indexed under this ID, so there is nothing to remove
          if (serializedFilesIDs.isEmpty()) {
            continue;
          }

          System.out.print("Removing => " + id + "...");
          // each document has its own subfolder below the serialized-files folder
          String folderName = getCompatibleName(id);
          File serializedRoot = new File(location, Constants.SERIALIZED_FOLDER_NAME);
          File documentFolder = new File(serializedRoot, folderName);

          for (String serializedFileID : serializedFilesIDs) {
            gate.creole.annic.apache.lucene.index.Term term =
                new gate.creole.annic.apache.lucene.index.Term(
                    Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE, serializedFileID);
            reader.delete(term);

            // remove the matching serialized file from disk as well
            String fileName = getCompatibleName(serializedFileID) + ".annic";
            File toDelete = new File(documentFolder, fileName);
            if (toDelete.exists()) {
              toDelete.delete();
            }
          }

          // File.delete() only removes a directory that is empty
          if (documentFolder.exists() && documentFolder.isDirectory()) {
            documentFolder.delete();
          }
          System.out.println("Done ");
        }
      }
    } finally {
      reader.close();
    }
  } catch (java.io.IOException ioe) {
    throw new IndexException(ioe);
  }
}
Use of gate.creole.annic.IndexException in project gate-core by GateNLP.
The class LuceneDataStoreImpl, method documentRemoved.
// Corpus Events
/**
 * Invoked whenever a document is removed from a corpus; removes that
 * document from the index.
 *
 * @param ce the corpus event; nothing happens when it carries no document
 *        persistence ID
 * @throws GateRuntimeException wrapping any {@code IndexException} raised
 *         by the indexer
 */
@Override
public void documentRemoved(CorpusEvent ce) {
  Object docLRID = ce.getDocumentLRID();
  if (docLRID == null) {
    // without a persistence ID there is nothing to look up in the index
    return;
  }

  ArrayList<Object> idsToRemove = new ArrayList<Object>();
  idsToRemove.add(docLRID);
  try {
    // index access is serialised on the indexer instance
    synchronized (indexer) {
      indexer.remove(idsToRemove);
    }
  } catch (IndexException ie) {
    throw new GateRuntimeException(ie);
  }
  // queueForIndexing(docLRID);
}
Aggregations