use of gate.Document in project gate-core by GateNLP.
the class CorpusAnnotationDiff method init.
/**
 * This method does the diff, Precision, Recall and FalsePositive
 * calculation and so on, pairing each key-corpus document with the
 * response-corpus document of the same name (or same source file).
 *
 * @return this resource, fully initialised
 * @throws ResourceInstantiationException if no annotation schema is set,
 *           or the key/response corpus is null or empty
 */
@Override
public Resource init() throws ResourceInstantiationException {
  // Colours used later by the visualisation for each annotation category
  colors[DEFAULT_TYPE] = WHITE;
  colors[CORRECT_TYPE] = GREEN;
  colors[SPURIOUS_TYPE] = RED;
  colors[PARTIALLY_CORRECT_TYPE] = BLUE;
  colors[MISSING_TYPE] = YELLOW;
  // Initialize the partially-correct sets
  keyPartiallySet = new HashSet<Annotation>();
  responsePartiallySet = new HashSet<Annotation>();
  // Do the diff, P&R calculation and so on
  AnnotationSet keyAnnotSet = null;
  AnnotationSet responseAnnotSet = null;
  if (annotationSchema == null)
    throw new ResourceInstantiationException("No annotation schema defined !");
  if (keyCorpus == null || keyCorpus.size() == 0)
    throw new ResourceInstantiationException("No key corpus or empty defined !");
  if (responseCorpus == null || responseCorpus.size() == 0)
    throw new ResourceInstantiationException("No response corpus or empty defined !");
  // Init counters and do the difference for documents by pairs
  for (int type = 0; type < MAX_TYPES; type++) typeCounter[type] = 0;
  diffSet = new HashSet<DiffSetElement>();
  for (int i = 0; i < keyCorpus.size(); ++i) {
    keyDocument = keyCorpus.get(i);
    // Find the corresponding response document, if any: matched either by
    // document name or by the file part of the source URL
    responseDocument = null;
    for (int j = 0; j < responseCorpus.size(); ++j) {
      Document doc = responseCorpus.get(j);
      if (doc.getName().equals(keyDocument.getName())
          || doc.getSourceUrl().getFile().equals(
              keyDocument.getSourceUrl().getFile())) {
        responseDocument = doc;
        // response corpus loop
        break;
      }
    }
    if (null == responseDocument) {
      // FIX: corrected typos in the user-visible message
      // ("mach" -> "match", "responce" -> "response")
      Out.prln("There is no match in response corpus for document '"
          + keyDocument.getName() + "' from key corpus");
      // key corpus loop
      continue;
    }
    if (keyAnnotationSetName == null) {
      // Get the default key AnnotationSet from the keyDocument
      keyAnnotSet = keyDocument.getAnnotations().get(
          annotationSchema.getAnnotationName());
    } else {
      keyAnnotSet = keyDocument.getAnnotations(keyAnnotationSetName).get(
          annotationSchema.getAnnotationName());
    }
    if (keyAnnotSet == null)
      // The diff will run with an empty set. All annotations from response
      // would be spurious.
      keyAnnotList = new LinkedList<Annotation>();
    else
      // The algorithm will modify this annotation set. It is better to make
      // a separate copy of them.
      keyAnnotList = new LinkedList<Annotation>(keyAnnotSet);
    if (responseAnnotationSetName == null)
      // Get the response AnnotationSet from the default set
      responseAnnotSet = responseDocument.getAnnotations().get(
          annotationSchema.getAnnotationName());
    else
      responseAnnotSet = responseDocument.getAnnotations(
          responseAnnotationSetName).get(annotationSchema.getAnnotationName());
    if (responseAnnotSet == null)
      // The diff will run with an empty set. All annotations from key
      // would be missing.
      responseAnnotList = new LinkedList<Annotation>();
    else
      // The algorithm will modify this annotation set. It is better to make
      // a separate copy of them.
      responseAnnotList = new LinkedList<Annotation>(responseAnnotSet);
    // Sort them ascending on Start offset (the comparator does that)
    AnnotationSetComparator asComparator = new AnnotationSetComparator();
    Collections.sort(keyAnnotList, asComparator);
    Collections.sort(responseAnnotList, asComparator);
    // Calculate the diff set. This set will be used later with the graphic
    // visualisation.
    doDiff(keyAnnotList, responseAnnotList);
  }
  // If it runs under text mode just stop here.
  if (textMode)
    return this;
  // Configure the formatter used later to format precision and recall:
  // at most one integer digit and exactly four fraction digits.
  formatter.setMaximumIntegerDigits(1);
  formatter.setMinimumFractionDigits(4);
  // FIX: the original called setMinimumFractionDigits(4) twice; the second,
  // duplicated call was clearly meant to set the *maximum* fraction digits.
  formatter.setMaximumFractionDigits(4);
  // Create an annotation diff table model
  AnnotationDiffTableModel diffModel = new AnnotationDiffTableModel(diffSet);
  // Create an XJTable based on this model
  diffTable = new XJTable(diffModel);
  diffTable.setAlignmentX(Component.LEFT_ALIGNMENT);
  // Set the cell renderer for this table.
  AnnotationDiffCellRenderer cellRenderer = new AnnotationDiffCellRenderer();
  diffTable.setDefaultRenderer(java.lang.String.class, cellRenderer);
  diffTable.setDefaultRenderer(java.lang.Long.class, cellRenderer);
  // Arrange all components on this JPanel, on the event-dispatch thread
  SwingUtilities.invokeLater(new Runnable() {
    @Override
    public void run() {
      arangeAllComponents();
    }
  });
  if (DEBUG)
    printStructure(diffSet);
  return this;
}
use of gate.Document in project gate-core by GateNLP.
the class CorpusImpl method populate.
/**
 * Fills the provided corpus with documents extracted from the
 * provided trec file.
 *
 * @param corpus the corpus to be populated.
 * @param singleConcatenatedFile the trec file.
 * @param documentRootElement text between this element (start and
 *          end) is considered for creating a new document.
 * @param encoding the encoding of the trec file.
 * @param numberOfDocumentsToExtract extracts the specified number of
 *          documents from the trecweb file; -1 to indicate all files.
 * @param documentNamePrefix prefix for generated document names; may be
 *          <tt>null</tt> for no prefix.
 * @param mimeType the mime type which determines how the document is handled
 * @param includeRootElement whether the root element tags themselves are
 *          kept in the created document content.
 * @return total length of populated documents in the corpus in number
 *         of bytes
 * @throws java.io.IOException
 */
public static long populate(Corpus corpus, URL singleConcatenatedFile, String documentRootElement, String encoding, int numberOfDocumentsToExtract, String documentNamePrefix, String mimeType, boolean includeRootElement) throws IOException {
  StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
  // content between the start and end of the user-provided root element is
  // considered for creating documents; matching is case-insensitive
  documentRootElement = documentRootElement.toLowerCase();
  // document name prefix could be an empty string
  documentNamePrefix = documentNamePrefix == null ? "" : documentNamePrefix.trim() + "_";
  // true when a usable (non-blank) encoding was supplied by the caller
  boolean hasEncoding = encoding != null && encoding.trim().length() != 0;
  // we start a new document when we find <documentRootElement> and
  // close it when we find </documentRootElement>
  BufferedReader br = null;
  try {
    if (hasEncoding) {
      br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(), encoding, 10485760);
    } else {
      br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(), 10485760);
    }
    // reading line by line
    String line = br.readLine();
    // this is where we store document content
    StringBuilder documentString = new StringBuilder();
    // toggle switch to indicate search for start element
    boolean searchingForStartElement = true;
    // keeping count of number of documents extracted
    int count = 1;
    // length in bytes read so far (to return)
    long lengthInBytes = 0;
    // continue until reached the end of file
    while (line != null) {
      // lowercase the line in order to match documentRootElement in any case
      String lowerCasedLine = line.toLowerCase();
      if (searchingForStartElement) {
        // maybe it's a start tag with attributes
        int index = lowerCasedLine.indexOf("<" + documentRootElement + " ");
        // maybe no attributes?
        if (index == -1) {
          index = lowerCasedLine.indexOf("<" + documentRootElement + ">");
        }
        if (index != -1) {
          // if found, that's the first line of the document
          line = line.substring(index);
          searchingForStartElement = false;
        } else {
          // skip the current line and start reading from the next line
          line = br.readLine();
        }
      } else {
        // now searching for the closing element
        int index = lowerCasedLine.indexOf("</" + documentRootElement + ">");
        if (index == -1) {
          // not found: this line is part of the current document's content
          documentString.append(line).append('\n');
          line = br.readLine();
        } else {
          // found: end the document ("</" + root + ">" is root length + 3)
          documentString.append(line.substring(0, index + documentRootElement.length() + 3));
          // getting ready for the next document
          searchingForStartElement = true;
          // create the document
          if (sListener != null)
            sListener.statusChanged("Creating Document Number :" + count);
          String docName = documentNamePrefix + count + "_" + Gate.genSym();
          String docContent = documentString.toString();
          if (!includeRootElement)
            docContent = docContent.substring(docContent.indexOf(">") + 1, docContent.lastIndexOf("<"));
          FeatureMap params = Factory.newFeatureMap();
          if (mimeType != null)
            params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
          params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, docContent);
          if (hasEncoding)
            params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
          // FIX: count bytes in the declared encoding rather than the
          // platform default charset, so the returned total is stable
          // across JVMs and matches the encoding the file was read with
          lengthInBytes += hasEncoding
              ? docContent.getBytes(encoding).length
              : docContent.getBytes().length;
          try {
            Document doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
            count++;
            corpus.add(doc);
            if (corpus.getLRPersistenceId() != null) {
              // persistent corpus -> unload the document
              corpus.unloadDocument(doc);
              Factory.deleteResource(doc);
            }
            // already extracted requested num of documents?
            if ((count - 1) == numberOfDocumentsToExtract)
              break;
          } catch (Throwable t) {
            String nl = Strings.getNl();
            Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + " Document name was: " + docName + nl + " Exception was: " + t + nl + nl);
            t.printStackTrace();
          }
          // reuse the buffer for the next document instead of reallocating
          documentString.setLength(0);
          if (sListener != null)
            sListener.statusChanged(docName + " created!");
          // keep scanning the remainder of this line: it may already
          // contain the start of the next document
          line = line.substring(index + documentRootElement.length() + 3);
          if (line.trim().equals(""))
            line = br.readLine();
        }
      }
    }
    return lengthInBytes;
  } finally {
    if (br != null)
      br.close();
  }
}
use of gate.Document in project gate-core by GateNLP.
the class CorpusImpl method populate.
/**
 * Fills the provided corpus with documents created on the fly from
 * selected files in a directory. Uses a {@link FileFilter} to select
 * which files will be used and which will be ignored. A simple file
 * filter based on extensions is provided in the Gate distribution (
 * {@link gate.util.ExtensionFileFilter}).
 *
 * @param corpus the corpus to be populated
 * @param directory the directory from which the files will be picked.
 *          This parameter is an URL for uniformity. It needs to be a
 *          URL of type file otherwise an InvalidArgumentException
 *          will be thrown.
 * @param filter the file filter used to select files from the target
 *          directory. If the filter is <tt>null</tt> all the files
 *          will be accepted.
 * @param encoding the encoding to be used for reading the documents
 * @param mimeType the mime type which determines how each document is
 *          handled; may be <tt>null</tt>
 * @param recurseDirectories should the directory be parsed
 *          recursively?. If <tt>true</tt> all the files from the
 *          provided directory and all its children directories (on as
 *          many levels as necessary) will be picked if accepted by
 *          the filter otherwise the children directories will be
 *          ignored.
 * @throws java.io.IOException if a file doesn't exist
 */
public static void populate(Corpus corpus, URL directory, FileFilter filter, String encoding, String mimeType, boolean recurseDirectories) throws IOException {
  // check input
  if (!directory.getProtocol().equalsIgnoreCase("file"))
    throw new IllegalArgumentException("The URL provided is not of type \"file:\"!");
  File dir = Files.fileFromURL(directory);
  if (!dir.exists())
    throw new FileNotFoundException(dir.toString());
  if (!dir.isDirectory())
    throw new IllegalArgumentException(dir.getAbsolutePath() + " is not a directory!");
  File[] files;
  // collect the candidate files
  if (recurseDirectories) {
    files = Files.listFilesRecursively(dir, filter);
  } else {
    files = dir.listFiles(filter);
  }
  if (files == null) {
    return;
  }
  // sort the files alphabetically regardless of their paths
  Arrays.sort(files, new Comparator<File>() {
    @Override
    public int compare(File f1, File f2) {
      return f1.getName().compareTo(f2.getName());
    }
  });
  // FIX: the status-listener lookup is loop-invariant; the original
  // re-fetched it from the global listener map once per file
  StatusListener sListener = (StatusListener) Gate.getListeners().get("gate.event.StatusListener");
  // create the GATE documents
  for (File file : files) {
    if (file.isDirectory()) {
      continue;
    }
    if (sListener != null)
      sListener.statusChanged("Reading: " + file.getName());
    String docName = file.getName() + "_" + Gate.genSym();
    FeatureMap params = Factory.newFeatureMap();
    params.put(Document.DOCUMENT_URL_PARAMETER_NAME, file.toURI().toURL());
    if (encoding != null)
      params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
    if (mimeType != null)
      params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
    try {
      Document doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
      corpus.add(doc);
      if (corpus.getLRPersistenceId() != null) {
        // persistent corpus -> unload the document after adding it
        corpus.unloadDocument(doc);
        Factory.deleteResource(doc);
      }
    } catch (Throwable t) {
      String nl = Strings.getNl();
      Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + " Document name was: " + docName + nl + " Exception was: " + t + nl + nl);
      t.printStackTrace();
    }
    if (sListener != null)
      sListener.statusChanged(file.getName() + " read");
  }
}
use of gate.Document in project gate-core by GateNLP.
the class LuceneDataStoreImpl method sync.
/**
 * Save: synchronise the in-memory image of the LR with the persistent
 * image. When the stored copy of a Document is already identical to the
 * in-memory one, both the disk write and the re-indexing are skipped.
 *
 * @param lr the language resource to save; if it already has a
 *          persistence ID its stored copy is compared against it first
 * @throws PersistenceException propagated from the superclass sync
 */
@Override
public void sync(LanguageResource lr) throws PersistenceException {
// An LR with a persistence ID has been saved before and may need comparison.
if (lr.getLRPersistenceId() != null) {
// lock the LR ID so we don't write to the file while an
// indexer task is reading it
Object lock = lockObjectForID(lr.getLRPersistenceId());
synchronized (lock) {
// we load the copy of this LR and check if any modification were done
// if so, it should be reindexed or else it should not be synced again.
LanguageResource copy = null;
try {
copy = getLr(lr.getClass().getName(), lr.getLRPersistenceId());
// we check it only if it is an instance of Document
if (copy instanceof Document && lr instanceof Document) {
Document cDoc = (Document) copy;
Document lrDoc = (Document) lr;
boolean sameDocs = false;
// compare content, the default annotation set and every named set,
// as that's what matters from the annic perspective
if (cDoc.getContent().equals(lrDoc.getContent())) {
if (cDoc.getAnnotations().equals(lrDoc.getAnnotations())) {
if (cDoc.getNamedAnnotationSets().equals(lrDoc.getNamedAnnotationSets())) {
// additionally compare each named set's annotations one by one
boolean allSetsSame = true;
for (String key : cDoc.getNamedAnnotationSets().keySet()) {
if (!cDoc.getAnnotations(key).equals(lrDoc.getAnnotations(key))) {
allSetsSame = false;
break;
}
}
if (allSetsSame) {
sameDocs = true;
}
}
}
}
// identical documents: skip super.sync AND the re-indexing below
if (sameDocs) {
// NOTE(review): dead store (the lock object is still referenced by
// the lock table); presumably intended as a GC hint — confirm
lock = null;
return;
}
}
} catch (SecurityException e) {
// NOTE(review): only SecurityException is caught (and merely printed);
// other failures from getLr propagate — confirm this is intentional
e.printStackTrace();
} finally {
// delete the copy of this LR
if (copy != null) {
Factory.deleteResource(copy);
}
}
// the stored copy differs (or could not be compared): persist now,
// while still holding the per-ID lock
super.sync(lr);
}
lock = null;
} else {
// never persisted before: plain save, nothing to compare against
super.sync(lr);
}
// Documents are the indexable resources: queue for (re-)indexing
if (lr instanceof Document) {
queueForIndexing(lr.getLRPersistenceId());
}
}
use of gate.Document in project gate-core by GateNLP.
the class SerialDataStore method sync.
// close()
/**
 * Save: synchronise the in-memory image of the LR with the persistent
 * image. Corpora (which must be {@code SerialCorpusImpl}) have each of
 * their in-memory documents adopted and synced first, then the LR itself
 * is serialised (gzip-compressed for protocol versions after 1.0) into a
 * file named after its persistence ID.
 *
 * @param lr the resource to save; must already have been adopted by this
 *          datastore
 * @throws PersistenceException if the LR was not adopted by this
 *           datastore, the storage directory cannot be written, a corpus
 *           document fails to save, or serialisation fails
 */
@Override
public void sync(LanguageResource lr) throws PersistenceException {
  // check that this LR is one of ours (i.e. has been adopted)
  if (lr.getDataStore() == null || !lr.getDataStore().equals(this))
    throw new PersistenceException("LR " + lr.getName() + " has not been adopted by this DataStore");
  // find the resource data for this LR
  ResourceData lrData = Gate.getCreoleRegister().get(lr.getClass().getName());
  // create a subdirectory for resources of this type if none exists
  File resourceTypeDirectory = new File(storageDir, lrData.getClassName());
  if ((!resourceTypeDirectory.exists()) || (!resourceTypeDirectory.isDirectory())) {
    // mkdir can race with another creator, so re-check existence on failure
    if (!resourceTypeDirectory.mkdir() && !resourceTypeDirectory.exists())
      throw new PersistenceException("Can't write " + resourceTypeDirectory);
  }
  // create an identifier for this resource
  String lrName = lr.getName();
  Object lrPersistenceId = lr.getLRPersistenceId();
  if (lrName == null)
    lrName = lrData.getName();
  if (lrPersistenceId == null) {
    lrPersistenceId = constructPersistenceId(lrName);
    lr.setLRPersistenceId(lrPersistenceId);
  }
  // when saving a corpus its documents must be saved first
  if (lr instanceof Corpus) {
    // check if the corpus is the one we support. CorpusImpl cannot be saved!
    if (!(lr instanceof SerialCorpusImpl))
      throw new PersistenceException("Can't save a corpus which " + "is not of type SerialCorpusImpl!");
    SerialCorpusImpl corpus = (SerialCorpusImpl) lr;
    for (int i = 0; i < corpus.size(); i++) {
      // if the document is not in memory, there's little point in saving it
      if ((!corpus.isDocumentLoaded(i)) && corpus.isPersistentDocument(i))
        continue;
      if (DEBUG) {
        Out.prln("Saving document at position " + i);
        Out.prln("Document in memory " + corpus.isDocumentLoaded(i));
        Out.prln("is persistent? " + corpus.isPersistentDocument(i));
        Out.prln("Document name at position" + corpus.getDocumentName(i));
      }
      Document doc = corpus.get(i);
      try {
        // if the document is not already adopted, we need to do that first
        if (doc.getLRPersistenceId() == null) {
          if (DEBUG)
            Out.prln("Document adopted" + doc.getName());
          doc = (Document) this.adopt(doc);
        }
        this.sync(doc);
        if (DEBUG)
          Out.prln("Document sync-ed");
        // store the persistent ID. Needs to be done even if the document was
        // already adopted, in case the doc was already persistent
        // when added to the corpus. (FIX: the original set it twice on the
        // freshly-adopted path; once after the sync covers both paths.)
        corpus.setDocumentPersistentID(i, doc.getLRPersistenceId());
        if (DEBUG)
          Out.prln("new document ID " + doc.getLRPersistenceId());
      } catch (Exception ex) {
        // FIX: added the missing space before "because" in the message
        throw new PersistenceException("Error while saving corpus: " + corpus + " because of an error storing document " + ex.getMessage(), ex);
      }
    }
    // for loop through documents
  }
  // create a File to store the resource in
  File resourceFile = new File(resourceTypeDirectory, (String) lrPersistenceId);
  // dump the LR into the new File.
  // FIX: use try-with-resources so the stream chain is always closed; the
  // original leaked the FileOutputStream whenever serialisation failed.
  try (OutputStream fileOut = new FileOutputStream(resourceFile)) {
    OutputStream os = fileOut;
    // after 1.1 the serialised files are compressed
    if (!currentProtocolVersion.equals("1.0"))
      os = new GZIPOutputStream(os);
    os = new BufferedOutputStream(os);
    try (ObjectOutputStream oos = new ObjectOutputStream(os)) {
      oos.writeObject(lr);
    }
  } catch (IOException e) {
    throw new PersistenceException("Couldn't write to storage file: " + e.getMessage(), e);
  }
  // let the world know about it
  fireResourceWritten(new DatastoreEvent(this, DatastoreEvent.RESOURCE_WRITTEN, lr, lrPersistenceId));
}
Aggregations