use of gate.Corpus in project gate-core by GateNLP.
the class CorpusPersistence method extractDataFromSource.
/**
 * Populates this Persistence with the data that needs to be stored from the
 * original source object.
 *
 * @param source the object to extract data from; must be a {@link Corpus}
 * @throws UnsupportedOperationException if {@code source} is {@code null} or
 *         is not a {@link Corpus}
 * @throws PersistenceException if extracting the persistent data fails
 */
@Override
public void extractDataFromSource(Object source) throws PersistenceException {
  // check input; null also fails the instanceof test, so the message must be
  // built without dereferencing the source
  if (!(source instanceof Corpus)) {
    String actualType = (source == null) ? "null" : source.getClass().getName();
    throw new UnsupportedOperationException(getClass().getName() + " can only be used for " + Corpus.class.getName() + " objects!\n" + actualType + " is not a " + Corpus.class.getName());
  }
  Corpus corpus = (Corpus) source;
  super.extractDataFromSource(source);
  if (dsData == null) {
    // transient corpus; we still need to save the docs
    docList = new ArrayList<Serializable>();
    for (Document doc : corpus) {
      docList.add(PersistenceManager.getPersistentRepresentation(doc));
    }
  } else {
    // persistent corpus; it takes care of documents by itself
    docList = null;
  }
}
use of gate.Corpus in project gate-core by GateNLP.
the class TestPersist method testMultipleLrs.
// testSimple()
/**
 * Test multiple LRs: stores a corpus holding two documents in a serial
 * datastore, reads the corpus back and compares it (and its first document)
 * with the in-memory originals. The datastore is deleted afterwards, even
 * when an assertion fails.
 */
public void testMultipleLrs() throws Exception {
  // create a temporary directory; because File.createTempFile actually
  // writes the file, we need to delete it from disk before calling
  // DataStore.create
  File storageDir = File.createTempFile("TestPersist__", "__StorageDir");
  storageDir.delete();
  // create and open a serial data store
  SerialDataStore sds = new SerialDataStore(storageDir.toURI().toURL().toString());
  sds.create();
  sds.open();
  try {
    // create a document with some annotations / features on it
    String server = TestDocument.getTestServerName();
    Document doc = Factory.newDocument(new URL(server + "tests/doc0.html"));
    doc.getFeatures().put("hi there", Integer.valueOf(23232));
    doc.getAnnotations().add(Long.valueOf(5L), Long.valueOf(25L), "ThingyMaJig", Factory.newFeatureMap());
    // create another document
    Document doc2 = Factory.newDocument(new URL(server + "tests/html/test1.htm"));
    // NOTE(review): the next two statements modify doc, not doc2 - this looks
    // like a copy/paste slip; confirm the intent before changing it
    doc.getFeatures().put("hi there again", Integer.valueOf(23232));
    doc.getAnnotations().add(Long.valueOf(5L), Long.valueOf(25L), "dog poo irritates", Factory.newFeatureMap());
    // create a corpus with the documents
    Corpus corp = Factory.newCorpus("Hamish test corpus");
    corp.add(doc);
    corp.add(doc2);
    LanguageResource persCorpus = sds.adopt(corp);
    sds.sync(persCorpus);
    // read the corpora back
    List<Resource> lrsFromDisk = new ArrayList<Resource>();
    List<String> lrIds = sds.getLrIds("gate.corpora.SerialCorpusImpl");
    for (String lrId : lrIds) {
      FeatureMap features = Factory.newFeatureMap();
      features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
      features.put(DataStore.LR_ID_FEATURE_NAME, lrId);
      Resource lr = Factory.createResource("gate.corpora.SerialCorpusImpl", features);
      lrsFromDisk.add(lr);
    }
    if (DEBUG)
      System.out.println("LRs on disk" + lrsFromDisk);
    // fail with a readable message instead of an IndexOutOfBoundsException
    assertTrue("no corpus was read back from the datastore", !lrsFromDisk.isEmpty());
    // check that the versions we read back match the originals
    Corpus diskCorp = (Corpus) lrsFromDisk.get(0);
    Document diskDoc = diskCorp.get(0);
    if (DEBUG)
      Out.prln("Documents in corpus: " + corp.getDocumentNames());
    assertTrue("corp name != mem name", corp.getName().equals(diskCorp.getName()));
    if (DEBUG)
      Out.prln("Memory features " + corp.getFeatures());
    if (DEBUG)
      Out.prln("Disk features " + diskCorp.getFeatures());
    assertTrue("corp feat != mem feat", corp.getFeatures().equals(diskCorp.getFeatures()));
    if (DEBUG)
      Out.prln("Annotations in doc: " + diskDoc.getAnnotations());
    assertTrue("doc annotations from disk not equal to memory version", TestEqual.annotationSetsEqual(doc.getAnnotations(), diskDoc.getAnnotations()));
    assertTrue("doc from disk not equal to memory version", TestEqual.documentsEqual(doc, diskDoc));
    // walk over every document of the corpus read from disk
    Iterator<Document> corpusIter = diskCorp.iterator();
    while (corpusIter.hasNext()) {
      Document current = corpusIter.next();
      if (DEBUG)
        Out.prln(current.getName());
    }
  } finally {
    // delete the datastore, whatever the outcome of the checks above
    sds.delete();
  }
}
use of gate.Corpus in project gate-core by GateNLP.
the class SerialCorpusImpl method setTransientSource.
/**
 * Initialises this corpus from a corpus held in memory: copies its name and
 * features, records name and class of each of its documents, copies the
 * documents themselves and resets the added/removed/changed bookkeeping.
 * Does nothing when the source is not a {@link Corpus} or when this corpus
 * already has both a datastore and a persistent ID.
 *
 * @param source the in-memory corpus to copy from
 */
public void setTransientSource(Object source) {
  // bail out for non-corpus sources and for a corpus that is already bound
  // to a datastore under a persistent ID
  if (!(source instanceof Corpus) || (this.dataStore != null && this.lrPersistentId != null)) {
    return;
  }
  Corpus transientCorpus = (Corpus) source;
  // take over name and features of the in-memory corpus
  this.setName(transientCorpus.getName());
  this.setFeatures(transientCorpus.getFeatures());
  // remember name and implementing class of every document; the second
  // DocumentData argument is left null here
  docDataList = new ArrayList<DocumentData>();
  List<String> names = transientCorpus.getDocumentNames();
  for (int pos = 0; pos < names.size(); pos++) {
    Document current = transientCorpus.get(pos);
    String docClassName = current.getClass().getName();
    docDataList.add(new DocumentData(names.get(pos), null, docClassName));
  }
  // copy all the documents from the transient corpus
  documents = new ArrayList<Document>();
  documents.addAll(transientCorpus);
  // start with empty change-tracking lists
  this.addedDocs = new Vector<Document>();
  this.removedDocIDs = new Vector<String>();
  this.changedDocs = new Vector<Document>();
  // register this corpus as a listener on the CREOLE register
  Gate.getCreoleRegister().addCreoleListener(this);
}
use of gate.Corpus in project gate-core by GateNLP.
the class SerialDataStore method sync.
// close()
/**
 * Save: synchronise the in-memory image of the LR with the persistent
 * image.
 * <p>
 * For a corpus, every document that is loaded (or not yet persistent) is
 * adopted if necessary and saved first, and its persistent ID is recorded in
 * the corpus. The LR itself is then serialised into a file named after its
 * persistence ID, inside a directory named after its resource class.
 *
 * @param lr the language resource to save; must have been adopted by this
 *          datastore
 * @throws PersistenceException if the LR has not been adopted by this
 *           datastore, no resource data is registered for its class, the
 *           directory for its type cannot be created, it is a corpus that is
 *           not a SerialCorpusImpl, one of its documents cannot be stored, or
 *           the storage file cannot be written
 */
@Override
public void sync(LanguageResource lr) throws PersistenceException {
  // check that this LR is one of ours (i.e. has been adopted)
  if (lr.getDataStore() == null || !lr.getDataStore().equals(this))
    throw new PersistenceException("LR " + lr.getName() + " has not been adopted by this DataStore");
  // find the resource data for this LR
  ResourceData lrData = Gate.getCreoleRegister().get(lr.getClass().getName());
  if (lrData == null)
    throw new PersistenceException("No resource data registered for " + lr.getClass().getName() + "; can't save LR " + lr.getName());
  // create a subdirectory for resources of this type if none exists
  File resourceTypeDirectory = new File(storageDir, lrData.getClassName());
  if (!resourceTypeDirectory.isDirectory()) {
    // re-check after a failed mkdir: the directory may have been created in
    // the meantime; a plain file in its place is still an error
    if (!resourceTypeDirectory.mkdir() && !resourceTypeDirectory.isDirectory())
      throw new PersistenceException("Can't write " + resourceTypeDirectory);
  }
  // create an identifier for this resource
  String lrName = lr.getName();
  if (lrName == null)
    lrName = lrData.getName();
  Object lrPersistenceId = lr.getLRPersistenceId();
  if (lrPersistenceId == null) {
    lrPersistenceId = constructPersistenceId(lrName);
    lr.setLRPersistenceId(lrPersistenceId);
  }
  // we're saving a corpus. I need to save its documents first
  if (lr instanceof Corpus) {
    // check if the corpus is the one we support. CorpusImpl cannot be saved!
    if (!(lr instanceof SerialCorpusImpl))
      throw new PersistenceException("Can't save a corpus which " + "is not of type SerialCorpusImpl!");
    SerialCorpusImpl corpus = (SerialCorpusImpl) lr;
    for (int i = 0; i < corpus.size(); i++) {
      // if the document is not in memory, there's little point in saving it
      if ((!corpus.isDocumentLoaded(i)) && corpus.isPersistentDocument(i))
        continue;
      if (DEBUG)
        Out.prln("Saving document at position " + i);
      if (DEBUG)
        Out.prln("Document in memory " + corpus.isDocumentLoaded(i));
      if (DEBUG)
        Out.prln("is persistent? " + corpus.isPersistentDocument(i));
      if (DEBUG)
        Out.prln("Document name at position" + corpus.getDocumentName(i));
      Document doc = corpus.get(i);
      try {
        // if the document is not already adopted, we need to do that first
        if (doc.getLRPersistenceId() == null) {
          if (DEBUG)
            Out.prln("Document adopted" + doc.getName());
          doc = (Document) this.adopt(doc);
        }
        this.sync(doc);
        if (DEBUG)
          Out.prln("Document sync-ed");
        // store the persistent ID. Needs to be done even if the document was
        // already adopted, in case the doc was already persistent
        // when added to the corpus
        corpus.setDocumentPersistentID(i, doc.getLRPersistenceId());
        if (DEBUG)
          Out.prln("new document ID " + doc.getLRPersistenceId());
      } catch (Exception ex) {
        throw new PersistenceException("Error while saving corpus: " + corpus + "because of an error storing document " + ex.getMessage(), ex);
      }
    }
  }
  // create a File to store the resource in
  File resourceFile = new File(resourceTypeDirectory, (String) lrPersistenceId);
  // dump the LR into the new File; "os" always refers to the outermost stream
  // opened so far, so the finally clause can release it on any failure
  OutputStream os = null;
  try {
    os = new FileOutputStream(resourceFile);
    // after 1.1 the serialised files are compressed
    if (!currentProtocolVersion.equals("1.0"))
      os = new GZIPOutputStream(os);
    os = new BufferedOutputStream(os);
    ObjectOutputStream oos = new ObjectOutputStream(os);
    os = oos;
    oos.writeObject(lr);
    oos.close();
    // closed cleanly: nothing left for the finally clause to do
    os = null;
  } catch (IOException e) {
    throw new PersistenceException("Couldn't write to storage file: " + e.getMessage(), e);
  } finally {
    if (os != null) {
      try {
        os.close();
      } catch (IOException ignored) {
        // only reached after an earlier failure, which is the one reported
      }
    }
  }
  // let the world know about it
  fireResourceWritten(new DatastoreEvent(this, DatastoreEvent.RESOURCE_WRITTEN, lr, lrPersistenceId));
}
use of gate.Corpus in project gate-core by GateNLP.
the class DynamicRegistrationTest method testDynamicRegistration.
/**
 * Registers {@code TestResource} with the CREOLE register as a plugin
 * component, adds an instance of it to a SerialAnalyserController and
 * executes the controller over a one-document corpus. The test passes when
 * no exception is thrown.
 */
public void testDynamicRegistration() throws Exception {
  // make the resource class known to the CREOLE register
  Plugin.Component dynamicPlugin = new Plugin.Component(TestResource.class);
  Gate.getCreoleRegister().registerPlugin(dynamicPlugin);

  // build a controller containing a single instance of the test resource
  SerialAnalyserController pipeline = (SerialAnalyserController) Factory.createResource(
      "gate.creole.SerialAnalyserController",
      Factory.newFeatureMap(),
      Factory.newFeatureMap(),
      "basicRun");
  ProcessingResource dynamicPr =
      (ProcessingResource) Factory.createResource(TestResource.class.getName());
  pipeline.add(dynamicPr);

  // a corpus with one short document to run over
  Corpus testCorpus = Factory.newCorpus("basicTestCorpus");
  Document sampleDoc = Factory.newDocument("This is the cereal shot from gnus.");
  testCorpus.add(sampleDoc);

  pipeline.setCorpus(testCorpus);
  pipeline.setDocument(sampleDoc);
  pipeline.execute();
}
Aggregations