Search in sources :

Example 6 with Corpus

use of gate.Corpus in project gate-core by GateNLP.

the class CorpusPersistence method extractDataFromSource.

/**
 * Populates this Persistence with the data that needs to be stored from the
 * original source object.
 *
 * <p>The superclass extraction runs first. If it leaves {@code dsData} unset
 * (the corpus is transient), a persistent representation of every document
 * in the corpus is collected into {@code docList}. Otherwise {@code docList}
 * is set to {@code null}, because a persistent corpus takes care of its
 * documents by itself.
 *
 * @param source the object to extract data from; must be a {@link Corpus}
 * @throws UnsupportedOperationException if {@code source} is {@code null} or
 *         is not a {@link Corpus}
 * @throws PersistenceException declared by the signature; not thrown
 *         directly here, so presumably propagated from the superclass or
 *         from {@code PersistenceManager}
 */
@Override
public void extractDataFromSource(Object source) throws PersistenceException {
    // check input
    if (!(source instanceof Corpus)) {
        // instanceof is false for null, so source must not be dereferenced
        // unconditionally while building the error message
        String actualType = (source == null) ? "null" : source.getClass().getName();
        throw new UnsupportedOperationException(getClass().getName() + " can only be used for " + Corpus.class.getName() + " objects!\n" + actualType + " is not a " + Corpus.class.getName());
    }
    Corpus corpus = (Corpus) source;
    super.extractDataFromSource(source);
    if (dsData == null) {
        // transient corpus; we still need to save the docs
        docList = new ArrayList<Serializable>();
        for (Document doc : corpus) {
            docList.add(PersistenceManager.getPersistentRepresentation(doc));
        }
    } else {
        // persistent corpus; it takes care of documents by itself
        docList = null;
    }
}
Also used : Serializable(java.io.Serializable) Document(gate.Document) Corpus(gate.Corpus)

Example 7 with Corpus

use of gate.Corpus in project gate-core by GateNLP.

the class TestPersist method testMultipleLrs.

// testSimple()
/**
 * Test multiple LRs: stores a corpus holding two documents in a serial
 * datastore, reads the corpus back and checks that its name, its features
 * and its first document match the in-memory originals.
 *
 * @throws Exception if the datastore, the documents or the corpus cannot be
 *         created, stored or read back
 */
public void testMultipleLrs() throws Exception {
    // create a temporary directory; because File.createTempFile actually
    // writes the file, we need to delete it from disk before calling
    // DataStore.create
    File storageDir = File.createTempFile("TestPersist__", "__StorageDir");
    storageDir.delete();
    // create and open a serial data store
    SerialDataStore sds = new SerialDataStore(storageDir.toURI().toURL().toString());
    sds.create();
    sds.open();
    try {
        // create a document with some annotations / features on it
        String server = TestDocument.getTestServerName();
        Document doc = Factory.newDocument(new URL(server + "tests/doc0.html"));
        doc.getFeatures().put("hi there", Integer.valueOf(23232));
        doc.getAnnotations().add(Long.valueOf(5L), Long.valueOf(25L), "ThingyMaJig", Factory.newFeatureMap());
        // create another document with some annotations / features on it
        // (these used to be added to the first document by mistake)
        Document doc2 = Factory.newDocument(new URL(server + "tests/html/test1.htm"));
        doc2.getFeatures().put("hi there again", Integer.valueOf(23232));
        doc2.getAnnotations().add(Long.valueOf(5L), Long.valueOf(25L), "dog poo irritates", Factory.newFeatureMap());
        // create a corpus with the documents
        Corpus corp = Factory.newCorpus("Hamish test corpus");
        corp.add(doc);
        corp.add(doc2);
        LanguageResource persCorpus = sds.adopt(corp);
        sds.sync(persCorpus);
        // read the corpora back
        List<Resource> lrsFromDisk = new ArrayList<Resource>();
        List<String> lrIds = sds.getLrIds("gate.corpora.SerialCorpusImpl");
        for (String lrId : lrIds) {
            FeatureMap features = Factory.newFeatureMap();
            features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
            features.put(DataStore.LR_ID_FEATURE_NAME, lrId);
            Resource lr = Factory.createResource("gate.corpora.SerialCorpusImpl", features);
            lrsFromDisk.add(lr);
        }
        if (DEBUG)
            System.out.println("LRs on disk" + lrsFromDisk);
        // fail with a clear message rather than an IndexOutOfBoundsException
        assertTrue("no corpus was read back from the datastore", !lrsFromDisk.isEmpty());
        // check that the versions we read back match the originals
        Corpus diskCorp = (Corpus) lrsFromDisk.get(0);
        Document diskDoc = diskCorp.get(0);
        if (DEBUG)
            Out.prln("Documents in corpus: " + corp.getDocumentNames());
        assertTrue("corp name != mem name", corp.getName().equals(diskCorp.getName()));
        if (DEBUG)
            Out.prln("Memory features " + corp.getFeatures());
        if (DEBUG)
            Out.prln("Disk features " + diskCorp.getFeatures());
        assertTrue("corp feat != mem feat", corp.getFeatures().equals(diskCorp.getFeatures()));
        if (DEBUG)
            Out.prln("Annotations in doc: " + diskDoc.getAnnotations());
        assertTrue("doc annotations from disk not equal to memory version", TestEqual.annotationSetsEqual(doc.getAnnotations(), diskDoc.getAnnotations()));
        assertTrue("doc from disk not equal to memory version", TestEqual.documentsEqual(doc, diskDoc));
        // walk the whole corpus so that every document is fetched through the iterator
        for (Document loaded : diskCorp) {
            if (DEBUG)
                Out.prln(loaded.getName());
        }
        // assertTrue("doc2 from disk not equal to memory version",
        // doc2.equals(diskDoc2));
    } finally {
        // delete the datastore even when an assertion above fails
        sds.delete();
    }
}
Also used : LanguageResource(gate.LanguageResource) Resource(gate.Resource) LanguageResource(gate.LanguageResource) ArrayList(java.util.ArrayList) TestDocument(gate.corpora.TestDocument) Document(gate.Document) URL(java.net.URL) Corpus(gate.Corpus) FeatureMap(gate.FeatureMap) File(java.io.File)

Example 8 with Corpus

use of gate.Corpus in project gate-core by GateNLP.

the class SerialCorpusImpl method setTransientSource.

/**
 * Initialises this corpus from an in-memory (transient) corpus: copies its
 * name and features, records one {@code DocumentData} entry per document,
 * copies the documents themselves, resets the added/removed/changed
 * bookkeeping and registers this corpus as a creole listener.
 *
 * <p>The call is ignored when {@code source} is not a {@link Corpus}, or
 * when this corpus already has both a datastore and a persistent ID.
 *
 * @param source the transient corpus to copy from
 */
public void setTransientSource(Object source) {
    if (!(source instanceof Corpus)) {
        return;
    }
    // nothing to initialise once both a datastore and a persistent ID are set
    boolean alreadyStored = this.dataStore != null && this.lrPersistentId != null;
    if (alreadyStored) {
        return;
    }
    Corpus transientCorpus = (Corpus) source;
    // take over the name and features of the in-memory corpus
    this.setName(transientCorpus.getName());
    this.setFeatures(transientCorpus.getFeatures());
    // cache the name and implementation class of every document for future
    // use; the second DocumentData argument is passed as null here
    docDataList = new ArrayList<DocumentData>();
    List<String> names = transientCorpus.getDocumentNames();
    int position = 0;
    while (position < names.size()) {
        String docClassName = transientCorpus.get(position).getClass().getName();
        docDataList.add(new DocumentData(names.get(position), null, docClassName));
        position++;
    }
    // copy all the documents from the transient corpus
    documents = new ArrayList<Document>();
    documents.addAll(transientCorpus);
    // start with empty change-tracking collections
    this.addedDocs = new Vector<Document>();
    this.removedDocIDs = new Vector<String>();
    this.changedDocs = new Vector<Document>();
    // make sure we fire events when docs are added/removed/etc
    Gate.getCreoleRegister().addCreoleListener(this);
}
Also used : Document(gate.Document) IndexedCorpus(gate.creole.ir.IndexedCorpus) Corpus(gate.Corpus)

Example 9 with Corpus

use of gate.Corpus in project gate-core by GateNLP.

the class SerialDataStore method sync.

// close()
/**
 * Save: synchronise the in-memory image of the LR with the persistent
 * image.
 *
 * <p>The LR must already have been adopted by this datastore. If it has no
 * persistence ID yet, one is constructed and set on it. When the LR is a
 * corpus, its documents are saved first (adopting those that have no
 * persistence ID) and their IDs are recorded on the corpus; documents that
 * are persistent but not loaded in memory are skipped. Finally the LR is
 * serialised into a file named after its persistence ID, inside a
 * directory named after its resource class, and a
 * {@code RESOURCE_WRITTEN} event is fired.
 *
 * @param lr the language resource to save
 * @throws PersistenceException if the LR has not been adopted by this
 *         datastore, if the type directory cannot be created, if the LR is
 *         a corpus that is not a {@code SerialCorpusImpl}, if saving one of
 *         a corpus' documents fails, or if the storage file cannot be
 *         written
 */
@Override
public void sync(LanguageResource lr) throws PersistenceException {
    // check that this LR is one of ours (i.e. has been adopted)
    if (lr.getDataStore() == null || !lr.getDataStore().equals(this)) {
        throw new PersistenceException("LR " + lr.getName() + " has not been adopted by this DataStore");
    }
    // find the resource data for this LR
    ResourceData lrData = Gate.getCreoleRegister().get(lr.getClass().getName());
    // create a subdirectory for resources of this type if none exists;
    // the second isDirectory() check covers the directory being created
    // by someone else in the meantime, and a plain file sitting at this
    // path is reported here instead of as a later write failure
    File resourceTypeDirectory = new File(storageDir, lrData.getClassName());
    if (!resourceTypeDirectory.isDirectory() && !resourceTypeDirectory.mkdir() && !resourceTypeDirectory.isDirectory()) {
        throw new PersistenceException("Can't write " + resourceTypeDirectory);
    }
    // create an identifier for this resource
    String lrName = lr.getName();
    Object lrPersistenceId = lr.getLRPersistenceId();
    if (lrName == null) {
        lrName = lrData.getName();
    }
    if (lrPersistenceId == null) {
        lrPersistenceId = constructPersistenceId(lrName);
        lr.setLRPersistenceId(lrPersistenceId);
    }
    // we're saving a corpus. I need to save its documents first
    if (lr instanceof Corpus) {
        // check if the corpus is the one we support. CorpusImpl cannot be saved!
        if (!(lr instanceof SerialCorpusImpl)) {
            throw new PersistenceException("Can't save a corpus which " + "is not of type SerialCorpusImpl!");
        }
        SerialCorpusImpl corpus = (SerialCorpusImpl) lr;
        // save each document and record the corresponding document IDs
        for (int i = 0; i < corpus.size(); i++) {
            // if the document is not in memory, there's little point in saving it
            if ((!corpus.isDocumentLoaded(i)) && corpus.isPersistentDocument(i)) {
                continue;
            }
            if (DEBUG) {
                Out.prln("Saving document at position " + i);
                Out.prln("Document in memory " + corpus.isDocumentLoaded(i));
                Out.prln("is persistent? " + corpus.isPersistentDocument(i));
                Out.prln("Document name at position" + corpus.getDocumentName(i));
            }
            Document doc = corpus.get(i);
            try {
                // if the document is not already adopted, we need to do that first
                if (doc.getLRPersistenceId() == null) {
                    if (DEBUG) {
                        Out.prln("Document adopted" + doc.getName());
                    }
                    doc = (Document) this.adopt(doc);
                }
                this.sync(doc);
                if (DEBUG) {
                    Out.prln("Document sync-ed");
                }
                // store the persistent ID. Needs to be done even if the document was
                // already adopted, in case the doc was already persistent
                // when added to the corpus
                corpus.setDocumentPersistentID(i, doc.getLRPersistenceId());
                if (DEBUG) {
                    Out.prln("new document ID " + doc.getLRPersistenceId());
                }
            } catch (Exception ex) {
                throw new PersistenceException("Error while saving corpus: " + corpus + " because of an error storing document " + ex.getMessage(), ex);
            }
        }
    }
    // create a File to store the resource in
    File resourceFile = new File(resourceTypeDirectory, (String) lrPersistenceId);
    // after 1.1 the serialised files are compressed
    boolean compress = !currentProtocolVersion.equals("1.0");
    // dump the LR into the new File; try-with-resources guarantees the file
    // handle is released even when a wrapper constructor or writeObject fails
    try (OutputStream fileOut = new FileOutputStream(resourceFile);
         ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(compress ? new GZIPOutputStream(fileOut) : fileOut))) {
        oos.writeObject(lr);
    } catch (IOException e) {
        throw new PersistenceException("Couldn't write to storage file: " + e.getMessage(), e);
    }
    // let the world know about it
    fireResourceWritten(new DatastoreEvent(this, DatastoreEvent.RESOURCE_WRITTEN, lr, lrPersistenceId));
}
Also used : ResourceData(gate.creole.ResourceData) BufferedOutputStream(java.io.BufferedOutputStream) ObjectOutputStream(java.io.ObjectOutputStream) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) GZIPOutputStream(java.util.zip.GZIPOutputStream) IOException(java.io.IOException) Document(gate.Document) ObjectOutputStream(java.io.ObjectOutputStream) Corpus(gate.Corpus) URISyntaxException(java.net.URISyntaxException) GateRuntimeException(gate.util.GateRuntimeException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) GZIPOutputStream(java.util.zip.GZIPOutputStream) SerialCorpusImpl(gate.corpora.SerialCorpusImpl) FileOutputStream(java.io.FileOutputStream) DatastoreEvent(gate.event.DatastoreEvent) File(java.io.File) BufferedOutputStream(java.io.BufferedOutputStream)

Example 10 with Corpus

use of gate.Corpus in project gate-core by GateNLP.

the class DynamicRegistrationTest method testDynamicRegistration.

/**
 * Registers {@code TestResource} as a plugin component at runtime, adds an
 * instance of it to a {@code SerialAnalyserController} and executes that
 * controller over a one-document corpus. The test has no explicit
 * assertions; it passes when none of these steps throws.
 *
 * @throws Exception if registration, resource creation or execution fails
 */
public void testDynamicRegistration() throws Exception {
    // make the resource class known to the creole register
    Plugin.Component testPlugin = new Plugin.Component(TestResource.class);
    Gate.getCreoleRegister().registerPlugin(testPlugin);
    // build a pipeline containing a single instance of the resource
    SerialAnalyserController pipeline = (SerialAnalyserController) Factory.createResource(
            "gate.creole.SerialAnalyserController",
            Factory.newFeatureMap(),
            Factory.newFeatureMap(),
            "basicRun");
    pipeline.add((ProcessingResource) Factory.createResource(TestResource.class.getName()));
    // one-document corpus for the pipeline to run over
    Corpus sampleCorpus = Factory.newCorpus("basicTestCorpus");
    Document sampleDoc = Factory.newDocument("This is the cereal shot from gnus.");
    sampleCorpus.add(sampleDoc);
    pipeline.setCorpus(sampleCorpus);
    pipeline.setDocument(sampleDoc);
    pipeline.execute();
}
Also used : ProcessingResource(gate.ProcessingResource) Document(gate.Document) Corpus(gate.Corpus)

Aggregations

Corpus (gate.Corpus): 15, Document (gate.Document): 7, CorpusController (gate.CorpusController): 4, LanguageResource (gate.LanguageResource): 4, ProcessingResource (gate.ProcessingResource): 4, GateRuntimeException (gate.util.GateRuntimeException): 4, FeatureMap (gate.FeatureMap): 3, Resource (gate.Resource): 3, File (java.io.File): 3, AbstractVisualResource (gate.creole.AbstractVisualResource): 2, ResourceData (gate.creole.ResourceData): 2, IndexedCorpus (gate.creole.ir.IndexedCorpus): 2, CreoleResource (gate.creole.metadata.CreoleResource): 2, DatastoreEvent (gate.event.DatastoreEvent): 2, IOException (java.io.IOException): 2, URISyntaxException (java.net.URISyntaxException): 2, URL (java.net.URL): 2, EventObject (java.util.EventObject): 2, Controller (gate.Controller): 1, CorpusExporter (gate.CorpusExporter): 1