Search in sources :

Example 21 with Document

use of gate.Document in project gate-core by GateNLP.

the class CorpusAnnotationDiff method init.

/**
 * Initialises this resource by computing the annotation diff for every
 * key/response document pair and, unless running in text mode, building the
 * table used to display the result.
 * <p>
 * Each document of the key corpus is paired with the first document of the
 * response corpus that has the same name or, failing that, the same source
 * URL file. Key documents without a match are reported and skipped.
 *
 * @return this resource
 * @throws ResourceInstantiationException if no annotation schema is set, or
 *           if the key or the response corpus is <code>null</code> or empty
 */
@Override
public Resource init() throws ResourceInstantiationException {
    colors[DEFAULT_TYPE] = WHITE;
    colors[CORRECT_TYPE] = GREEN;
    colors[SPURIOUS_TYPE] = RED;
    colors[PARTIALLY_CORRECT_TYPE] = BLUE;
    colors[MISSING_TYPE] = YELLOW;
    // Initialize the partially sets...
    keyPartiallySet = new HashSet<Annotation>();
    responsePartiallySet = new HashSet<Annotation>();
    // Do the diff, P&R calculation and so on
    AnnotationSet keyAnnotSet = null;
    AnnotationSet responseAnnotSet = null;
    if (annotationSchema == null)
        throw new ResourceInstantiationException("No annotation schema defined !");
    if (keyCorpus == null || 0 == keyCorpus.size())
        throw new ResourceInstantiationException("No key corpus or empty defined !");
    if (responseCorpus == null || 0 == responseCorpus.size())
        throw new ResourceInstantiationException("No response corpus or empty defined !");
    // init counters and do difference for documents by pairs
    for (int type = 0; type < MAX_TYPES; type++) typeCounter[type] = 0;
    diffSet = new HashSet<DiffSetElement>();
    for (int i = 0; i < keyCorpus.size(); ++i) {
        keyDocument = keyCorpus.get(i);
        // find corresponding response document if any
        Document doc;
        responseDocument = null;
        for (int j = 0; j < responseCorpus.size(); ++j) {
            doc = responseCorpus.get(j);
            boolean matches = 0 == doc.getName().compareTo(keyDocument.getName());
            // Only fall back to the source URL when both documents have one;
            // dereferencing a missing URL here used to abort the whole diff
            // with a NullPointerException.
            if (!matches && doc.getSourceUrl() != null && keyDocument.getSourceUrl() != null) {
                matches = 0 == doc.getSourceUrl().getFile().compareTo(keyDocument.getSourceUrl().getFile());
            }
            if (matches) {
                responseDocument = doc;
                // response corpus loop
                break;
            }
        }
        if (null == responseDocument) {
            Out.prln("There is no mach in responce corpus for document '" + keyDocument.getName() + "' from key corpus");
            // key corpus loop
            continue;
        }
        if (keyAnnotationSetName == null) {
            // Get the default key AnnotationSet from the keyDocument
            keyAnnotSet = keyDocument.getAnnotations().get(annotationSchema.getAnnotationName());
        } else {
            keyAnnotSet = keyDocument.getAnnotations(keyAnnotationSetName).get(annotationSchema.getAnnotationName());
        }
        if (keyAnnotSet == null) {
            // The diff will run with an empty set. All annotations from response
            // would be spurious.
            keyAnnotList = new LinkedList<Annotation>();
        } else {
            // The algorithm will modify this annotation set. It is better to make a
            // separate copy of them.
            keyAnnotList = new LinkedList<Annotation>(keyAnnotSet);
        }
        if (responseAnnotationSetName == null) {
            // Get the response AnnotationSet from the default set
            responseAnnotSet = responseDocument.getAnnotations().get(annotationSchema.getAnnotationName());
        } else {
            responseAnnotSet = responseDocument.getAnnotations(responseAnnotationSetName).get(annotationSchema.getAnnotationName());
        }
        if (responseAnnotSet == null) {
            // The diff will run with an empty set. All annotations from key
            // would be missing.
            responseAnnotList = new LinkedList<Annotation>();
        } else {
            // The algorithm will modify this annotation set. It is better to make a
            // separate copy of them.
            responseAnnotList = new LinkedList<Annotation>(responseAnnotSet);
        }
        // Sort them ascending on Start offset (the comparator does that)
        AnnotationSetComparator asComparator = new AnnotationSetComparator();
        Collections.sort(keyAnnotList, asComparator);
        Collections.sort(responseAnnotList, asComparator);
        // Calculate the diff Set. This set will be used later with graphic
        // visualisation.
        doDiff(keyAnnotList, responseAnnotList);
    }
    // If it runs under text mode just stop here.
    if (textMode)
        return this;
    // Show it
    // Configuring the formatter object. It will be used later to format
    // precision and recall: one integer digit and exactly four fraction digits.
    formatter.setMaximumIntegerDigits(1);
    formatter.setMinimumFractionDigits(4);
    formatter.setMaximumFractionDigits(4);
    // Create an Annotation diff table model
    AnnotationDiffTableModel diffModel = new AnnotationDiffTableModel(diffSet);
    // Create a XJTable based on this model
    diffTable = new XJTable(diffModel);
    diffTable.setAlignmentX(Component.LEFT_ALIGNMENT);
    // Set the cell renderer for this table.
    AnnotationDiffCellRenderer cellRenderer = new AnnotationDiffCellRenderer();
    diffTable.setDefaultRenderer(java.lang.String.class, cellRenderer);
    diffTable.setDefaultRenderer(java.lang.Long.class, cellRenderer);
    // Arrange all components on this JPanel; done via invokeLater so the
    // layout work runs on the Swing event dispatch thread.
    SwingUtilities.invokeLater(new Runnable() {

        @Override
        public void run() {
            arangeAllComponents();
        }
    });
    if (DEBUG)
        printStructure(diffSet);
    return this;
}
Also used : XJTable(gate.swing.XJTable) AnnotationSet(gate.AnnotationSet) Document(gate.Document) Annotation(gate.Annotation) LinkedList(java.util.LinkedList) ResourceInstantiationException(gate.creole.ResourceInstantiationException)

Example 22 with Document

use of gate.Document in project gate-core by GateNLP.

the class CorpusImpl method populate.

/**
 * Fills the provided corpus with documents extracted from the
 * provided trec file.
 *
 * @param corpus the corpus to be populated.
 * @param singleConcatenatedFile the trec file.
 * @param documentRootElement text between this element (start and
 *          end) is considered for creating a new document. Matching is
 *          case-insensitive.
 * @param encoding the encoding of the trec file; if <code>null</code> or
 *          blank the reader is created without an explicit encoding.
 * @param numberOfDocumentsToExtract extracts the specified number of
 *          documents from the trecweb file; -1 to indicate all files.
 * @param documentNamePrefix prefix for the generated document names; if
 *          <code>null</code> no prefix is used, otherwise the trimmed value
 *          followed by an underscore is prepended.
 * @param mimeType the mime type which determines how the document is handled
 * @param includeRootElement if <code>false</code>, everything up to the first
 *          <code>&gt;</code> and from the last <code>&lt;</code> onwards is
 *          removed from the document content, i.e. the root element tags are
 *          dropped.
 * @return total length of populated documents in the corpus in number
 *         of bytes, measured with the platform default charset; content is
 *         counted even when creating the document itself fails.
 * @throws java.io.IOException if the file cannot be opened or read
 */
public static long populate(Corpus corpus, URL singleConcatenatedFile, String documentRootElement, String encoding, int numberOfDocumentsToExtract, String documentNamePrefix, String mimeType, boolean includeRootElement) throws IOException {
    StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
    // obtain the root element that user has provided
    // content between the start and end of root element is considered
    // for creating documents.
    // Lowercasing uses Locale.ROOT so that tag matching does not depend on
    // the JVM default locale (e.g. the Turkish dotless i).
    documentRootElement = documentRootElement.toLowerCase(java.util.Locale.ROOT);
    // a null prefix means no prefix; anything else is trimmed and gets "_"
    documentNamePrefix = documentNamePrefix == null ? "" : documentNamePrefix.trim() + "_";
    // we start a new document when we find <documentRootElement> and
    // close it when we find </documentRootElement>
    // The raw stream is kept separately so it can still be closed if
    // constructing the reader fails (e.g. unsupported encoding).
    java.io.InputStream in = singleConcatenatedFile.openStream();
    BufferedReader br = null;
    try {
        if (encoding != null && encoding.trim().length() != 0) {
            br = new BomStrippingInputStreamReader(in, encoding, 10485760);
        } else {
            br = new BomStrippingInputStreamReader(in, 10485760);
        }
        // reading line by line
        String line = br.readLine();
        // this is where we store document content
        StringBuilder documentString = new StringBuilder();
        // toggle switch to indicate search for start element
        boolean searchingForStartElement = true;
        // keeping count of number of documents extracted
        int count = 1;
        // length in bytes read so far (to return)
        long lengthInBytes = 0;
        // continue until reached the end of file
        while (line != null) {
            // lowercase the line in order to match documentRootElement in any case
            String lowerCasedLine = line.toLowerCase(java.util.Locale.ROOT);
            // if searching for startElement?
            if (searchingForStartElement) {
                // may be its with attributes
                int index = lowerCasedLine.indexOf("<" + documentRootElement + " ");
                // may be no attributes?
                if (index == -1) {
                    index = lowerCasedLine.indexOf("<" + documentRootElement + ">");
                }
                if (index != -1) {
                    // found: keep the rest of this line, it is re-examined on the
                    // next iteration as the first line of the document
                    line = line.substring(index);
                    searchingForStartElement = false;
                } else {
                    // not found: skip the current line
                    line = br.readLine();
                }
            } else {
                // now searching for last element
                int index = lowerCasedLine.indexOf("</" + documentRootElement + ">");
                // if not found.. this is the content of a new document
                if (index == -1) {
                    documentString.append(line + "\n");
                    line = br.readLine();
                } else {
                    // found.. then end the document; +3 accounts for "</" and ">"
                    documentString.append(line.substring(0, index + documentRootElement.length() + 3));
                    // getting ready for the next document
                    searchingForStartElement = true;
                    // here lets create a new document create the doc
                    if (sListener != null)
                        sListener.statusChanged("Creating Document Number :" + count);
                    String docName = documentNamePrefix + count + "_" + Gate.genSym();
                    String docContent = documentString.toString();
                    if (!includeRootElement)
                        docContent = docContent.substring(docContent.indexOf(">") + 1, docContent.lastIndexOf("<"));
                    FeatureMap params = Factory.newFeatureMap();
                    if (mimeType != null)
                        params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
                    params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, docContent);
                    if (encoding != null && encoding.trim().length() > 0)
                        params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
                    // calculate the length
                    // NOTE(review): getBytes() uses the platform default charset, not
                    // the supplied encoding - confirm this is the intended measure.
                    lengthInBytes += docContent.getBytes().length;
                    try {
                        Document doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
                        count++;
                        corpus.add(doc);
                        if (corpus.getLRPersistenceId() != null) {
                            // persistent corpus -> unload the document
                            corpus.unloadDocument(doc);
                            Factory.deleteResource(doc);
                        }
                        // already extracted requested num of documents?
                        if ((count - 1) == numberOfDocumentsToExtract)
                            break;
                    } catch (Throwable t) {
                        // best effort: report the failure and carry on with the
                        // next document
                        String nl = Strings.getNl();
                        Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + "  Document name was: " + docName + nl + "  Exception was: " + t + nl + nl);
                        t.printStackTrace();
                    }
                    documentString = new StringBuilder();
                    if (sListener != null)
                        sListener.statusChanged(docName + " created!");
                    // whatever follows the end tag on this line may start another
                    // document; only read a new line when nothing is left
                    line = line.substring(index + documentRootElement.length() + 3);
                    if (line.trim().equals(""))
                        line = br.readLine();
                }
            }
        }
        return lengthInBytes;
    } finally {
        if (br != null) {
            br.close();
        } else {
            in.close();
        }
    }
}
Also used : FeatureMap(gate.FeatureMap) BomStrippingInputStreamReader(gate.util.BomStrippingInputStreamReader) BufferedReader(java.io.BufferedReader) StatusListener(gate.event.StatusListener) Document(gate.Document)

Example 23 with Document

use of gate.Document in project gate-core by GateNLP.

the class CorpusImpl method populate.

/**
 * Populates a corpus with one document per accepted file found in a
 * directory. Files are chosen with a {@link FileFilter}; a simple
 * extension-based filter ships with the Gate distribution (
 * {@link gate.util.ExtensionFileFilter}). Files are processed in
 * alphabetical order of their simple names, whatever their paths, and
 * directories in the listing are skipped. A file whose document cannot be
 * created is reported as a warning and does not stop the run.
 *
 * @param corpus the corpus to be populated
 * @param directory the directory from which the files will be picked,
 *          given as a URL for uniformity. It must be a URL of type file,
 *          otherwise an IllegalArgumentException is thrown.
 * @param filter the file filter used to select files from the target
 *          directory. If the filter is <tt>null</tt> all the files
 *          will be accepted.
 * @param encoding the encoding to be used for reading the documents;
 *          not passed on to the documents when <tt>null</tt>
 * @param mimeType the mime type to set on the created documents; not
 *          passed on when <tt>null</tt>
 * @param recurseDirectories if <tt>true</tt> files are collected from the
 *          provided directory and all its children directories (on as
 *          many levels as necessary); otherwise only the directory
 *          itself is listed and children directories are ignored.
 * @throws java.io.IOException if the directory doesn't exist
 */
public static void populate(Corpus corpus, URL directory, FileFilter filter, String encoding, String mimeType, boolean recurseDirectories) throws IOException {
    // validate the location before touching the file system
    final boolean isFileUrl = directory.getProtocol().equalsIgnoreCase("file");
    if (!isFileUrl) {
        throw new IllegalArgumentException("The URL provided is not of type \"file:\"!");
    }
    final File rootDir = Files.fileFromURL(directory);
    if (!rootDir.exists()) {
        throw new FileNotFoundException(rootDir.toString());
    }
    if (!rootDir.isDirectory()) {
        throw new IllegalArgumentException(rootDir.getAbsolutePath() + " is not a directory!");
    }
    // collect the candidate files, descending into sub-directories on request
    final File[] candidates = recurseDirectories ? Files.listFilesRecursively(rootDir, filter) : rootDir.listFiles(filter);
    if (candidates == null) {
        return;
    }
    // order by simple file name, ignoring the directory part
    Arrays.sort(candidates, new Comparator<File>() {

        @Override
        public int compare(File left, File right) {
            return left.getName().compareTo(right.getName());
        }
    });
    // turn every remaining regular entry into a GATE document
    for (int idx = 0; idx < candidates.length; idx++) {
        final File current = candidates[idx];
        if (current.isDirectory()) {
            continue;
        }
        final StatusListener statusListener = (StatusListener) Gate.getListeners().get("gate.event.StatusListener");
        if (statusListener != null) {
            statusListener.statusChanged("Reading: " + current.getName());
        }
        final String generatedName = current.getName() + "_" + Gate.genSym();
        final FeatureMap creationParams = Factory.newFeatureMap();
        creationParams.put(Document.DOCUMENT_URL_PARAMETER_NAME, current.toURI().toURL());
        if (encoding != null) {
            creationParams.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
        }
        if (mimeType != null) {
            creationParams.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
        }
        try {
            final Document created = (Document) Factory.createResource(DocumentImpl.class.getName(), creationParams, null, generatedName);
            corpus.add(created);
            // a persistent corpus keeps its own copy, so release the in-memory one
            if (corpus.getLRPersistenceId() != null) {
                corpus.unloadDocument(created);
                Factory.deleteResource(created);
            }
        } catch (Throwable failure) {
            // best effort: warn and move on to the next file
            final String newline = Strings.getNl();
            Err.prln("WARNING: Corpus.populate could not instantiate document" + newline + "  Document name was: " + generatedName + newline + "  Exception was: " + failure + newline + newline);
            failure.printStackTrace();
        }
        if (statusListener != null) {
            statusListener.statusChanged(current.getName() + " read");
        }
    }
}
Also used : FeatureMap(gate.FeatureMap) FileNotFoundException(java.io.FileNotFoundException) StatusListener(gate.event.StatusListener) Document(gate.Document) File(java.io.File)

Example 24 with Document

use of gate.Document in project gate-core by GateNLP.

the class LuceneDataStoreImpl method sync.

/**
 * Save: synchronise the in-memory image of the LR with the persistent
 * image.
 * <p>
 * For an LR that already has a persistence ID, the stored copy is loaded
 * first. When both are documents with equal content, default annotations
 * and named annotation sets, nothing is written and the document is not
 * queued for indexing. In every other case the LR is synced through the
 * superclass and, if it is a document, queued for indexing.
 */
@Override
public void sync(LanguageResource lr) throws PersistenceException {
    if (lr.getLRPersistenceId() == null) {
        // never stored before: nothing to compare against
        super.sync(lr);
    } else {
        // lock the LR ID so we don't write to the file while an
        // indexer task is reading it
        final Object idLock = lockObjectForID(lr.getLRPersistenceId());
        synchronized (idLock) {
            // load the stored copy and find out whether anything changed;
            // an unchanged document needs neither saving nor reindexing
            LanguageResource stored = null;
            try {
                stored = getLr(lr.getClass().getName(), lr.getLRPersistenceId());
                // the comparison only applies to documents
                if (stored instanceof Document && lr instanceof Document) {
                    final Document storedDoc = (Document) stored;
                    final Document liveDoc = (Document) lr;
                    // content and annotations are what matter from the annic
                    // perspective; checks short-circuit in this order
                    boolean unchanged = storedDoc.getContent().equals(liveDoc.getContent())
                            && storedDoc.getAnnotations().equals(liveDoc.getAnnotations())
                            && storedDoc.getNamedAnnotationSets().equals(liveDoc.getNamedAnnotationSets());
                    if (unchanged) {
                        for (String setName : storedDoc.getNamedAnnotationSets().keySet()) {
                            if (!storedDoc.getAnnotations(setName).equals(liveDoc.getAnnotations(setName))) {
                                unchanged = false;
                                break;
                            }
                        }
                    }
                    if (unchanged) {
                        return;
                    }
                }
            } catch (SecurityException e) {
                e.printStackTrace();
            } finally {
                // the loaded copy was only needed for the comparison
                if (stored != null) {
                    Factory.deleteResource(stored);
                }
            }
            super.sync(lr);
        }
    }
    if (lr instanceof Document) {
        queueForIndexing(lr.getLRPersistenceId());
    }
}
Also used : LanguageResource(gate.LanguageResource) Document(gate.Document)

Example 25 with Document

use of gate.Document in project gate-core by GateNLP.

the class SerialDataStore method sync.

// close()
/**
 * Save: synchronise the in-memory image of the LR with the persistent
 * image.
 * <p>
 * The LR is serialised into a file named after its persistence ID, inside a
 * sub-directory of the storage directory named after the resource class. A
 * persistence ID is generated and set on the LR if it has none. When the LR
 * is a corpus, its loaded documents are adopted (if necessary) and synced
 * first. A resource-written event is fired once the file has been written.
 *
 * @param lr the language resource to save; it must already have been
 *          adopted by this datastore
 * @throws PersistenceException if the LR does not belong to this datastore,
 *           the type directory cannot be created, the LR is a corpus that is
 *           not a SerialCorpusImpl, one of its documents cannot be stored,
 *           or the storage file cannot be written
 */
@Override
public void sync(LanguageResource lr) throws PersistenceException {
    // check that this LR is one of ours (i.e. has been adopted)
    if (lr.getDataStore() == null || !lr.getDataStore().equals(this))
        throw new PersistenceException("LR " + lr.getName() + " has not been adopted by this DataStore");
    // find the resource data for this LR
    ResourceData lrData = Gate.getCreoleRegister().get(lr.getClass().getName());
    // create a subdirectory for resources of this type if none exists
    File resourceTypeDirectory = new File(storageDir, lrData.getClassName());
    if ((!resourceTypeDirectory.exists()) || (!resourceTypeDirectory.isDirectory())) {
        // a failed mkdir is tolerated if the directory exists by now, e.g.
        // because something else created it in the meantime
        if (!resourceTypeDirectory.mkdir() && !resourceTypeDirectory.exists())
            throw new PersistenceException("Can't write " + resourceTypeDirectory);
    }
    // create an identifier for this resource
    String lrName = lr.getName();
    Object lrPersistenceId = lr.getLRPersistenceId();
    if (lrName == null)
        lrName = lrData.getName();
    if (lrPersistenceId == null) {
        lrPersistenceId = constructPersistenceId(lrName);
        lr.setLRPersistenceId(lrPersistenceId);
    }
    // we're saving a corpus. I need to save its documents first
    if (lr instanceof Corpus) {
        // check if the corpus is the one we support. CorpusImpl cannot be saved!
        if (!(lr instanceof SerialCorpusImpl))
            throw new PersistenceException("Can't save a corpus which " + "is not of type SerialCorpusImpl!");
        SerialCorpusImpl corpus = (SerialCorpusImpl) lr;
        // save the loaded documents and record the corresponding document IDs
        for (int i = 0; i < corpus.size(); i++) {
            // if the document is not in memory, there's little point in saving it
            if ((!corpus.isDocumentLoaded(i)) && corpus.isPersistentDocument(i))
                continue;
            if (DEBUG) {
                Out.prln("Saving document at position " + i);
                Out.prln("Document in memory " + corpus.isDocumentLoaded(i));
                Out.prln("is persistent? " + corpus.isPersistentDocument(i));
                Out.prln("Document name at position" + corpus.getDocumentName(i));
            }
            Document doc = corpus.get(i);
            try {
                // if the document is not already adopted, we need to do that first
                if (doc.getLRPersistenceId() == null) {
                    if (DEBUG)
                        Out.prln("Document adopted" + doc.getName());
                    doc = (Document) this.adopt(doc);
                    this.sync(doc);
                    if (DEBUG)
                        Out.prln("Document sync-ed");
                    corpus.setDocumentPersistentID(i, doc.getLRPersistenceId());
                } else {
                    // if it is adopted, just sync it
                    this.sync(doc);
                    if (DEBUG)
                        Out.prln("Document sync-ed");
                }
                // store the persistent ID. Needs to be done even if the document was
                // already adopted, in case the doc was already persistent
                // when added to the corpus
                corpus.setDocumentPersistentID(i, doc.getLRPersistenceId());
                if (DEBUG)
                    Out.prln("new document ID " + doc.getLRPersistenceId());
            } catch (Exception ex) {
                throw new PersistenceException("Error while saving corpus: " + corpus + "because of an error storing document " + ex.getMessage(), ex);
            }
        }
    // for loop through documents
    }
    // create a File to store the resource in
    File resourceFile = new File(resourceTypeDirectory, (String) lrPersistenceId);
    // dump the LR into the new File
    try {
        // 'os' always refers to the outermost stream built so far, so closing
        // it releases the whole chain down to the file handle
        OutputStream os = new FileOutputStream(resourceFile);
        boolean closedCleanly = false;
        try {
            // after 1.1 the serialised files are compressed
            if (!currentProtocolVersion.equals("1.0"))
                os = new GZIPOutputStream(os);
            os = new BufferedOutputStream(os);
            ObjectOutputStream oos = new ObjectOutputStream(os);
            oos.writeObject(lr);
            // closing on the success path flushes the data; a failure here
            // must be reported to the caller
            oos.close();
            closedCleanly = true;
        } finally {
            if (!closedCleanly) {
                // something failed above: release the file handle without
                // masking the exception that is already propagating
                try {
                    os.close();
                } catch (IOException ignored) {
                    // the original failure is the one worth reporting
                }
            }
        }
    } catch (IOException e) {
        throw new PersistenceException("Couldn't write to storage file: " + e.getMessage(), e);
    }
    // let the world know about it
    fireResourceWritten(new DatastoreEvent(this, DatastoreEvent.RESOURCE_WRITTEN, lr, lrPersistenceId));
}
Also used : ResourceData(gate.creole.ResourceData) BufferedOutputStream(java.io.BufferedOutputStream) ObjectOutputStream(java.io.ObjectOutputStream) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) GZIPOutputStream(java.util.zip.GZIPOutputStream) IOException(java.io.IOException) Document(gate.Document) ObjectOutputStream(java.io.ObjectOutputStream) Corpus(gate.Corpus) URISyntaxException(java.net.URISyntaxException) GateRuntimeException(gate.util.GateRuntimeException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) GZIPOutputStream(java.util.zip.GZIPOutputStream) SerialCorpusImpl(gate.corpora.SerialCorpusImpl) FileOutputStream(java.io.FileOutputStream) DatastoreEvent(gate.event.DatastoreEvent) File(java.io.File) BufferedOutputStream(java.io.BufferedOutputStream)

Aggregations

Document (gate.Document)47 File (java.io.File)17 FeatureMap (gate.FeatureMap)16 URL (java.net.URL)12 AnnotationSet (gate.AnnotationSet)9 TestDocument (gate.corpora.TestDocument)9 Annotation (gate.Annotation)7 Corpus (gate.Corpus)7 ResourceInstantiationException (gate.creole.ResourceInstantiationException)7 PersistenceException (gate.persist.PersistenceException)6 DataStore (gate.DataStore)5 LanguageResource (gate.LanguageResource)5 ArrayList (java.util.ArrayList)5 HashSet (java.util.HashSet)5 LanguageAnalyser (gate.LanguageAnalyser)4 SerialDataStore (gate.persist.SerialDataStore)4 GateRuntimeException (gate.util.GateRuntimeException)4 ActionEvent (java.awt.event.ActionEvent)4 List (java.util.List)4 AbstractAction (javax.swing.AbstractAction)4