use of gate.event.StatusListener in project gate-core by GateNLP.
the class CorpusImpl method populate.
/**
 * Fills the provided corpus with documents extracted from a single
 * concatenated file (for example a trec file).
 *
 * The file is read line by line. A new document starts at the first
 * occurrence of {@code <documentRootElement>} (with or without attributes)
 * and ends at the matching {@code </documentRootElement>}; element names are
 * matched case-insensitively. Text outside these elements is ignored.
 * Documents that cannot be instantiated are reported on the error stream and
 * skipped; they do not count towards {@code numberOfDocumentsToExtract}.
 *
 * @param corpus the corpus to be populated.
 * @param singleConcatenatedFile the URL of the concatenated file.
 * @param documentRootElement text between this element (start and
 *          end) is considered for creating a new document.
 * @param encoding the encoding of the file; if <tt>null</tt> or blank the
 *          reader is created without an explicit encoding.
 * @param numberOfDocumentsToExtract stop after this many documents have
 *          been created; -1 to indicate all documents.
 * @param documentNamePrefix prefix for the generated document names; it is
 *          trimmed and followed by an underscore. <tt>null</tt> means no
 *          prefix.
 * @param mimeType the mime type which determines how the document is
 *          handled; may be <tt>null</tt>.
 * @param includeRootElement if <tt>false</tt>, the opening and closing root
 *          element tags are stripped from the document content.
 * @return total length of the extracted document contents in bytes, as
 *         measured with the platform default charset. Content whose document
 *         could not be instantiated is still counted.
 * @throws java.io.IOException if the file cannot be opened or read.
 */
public static long populate(Corpus corpus, URL singleConcatenatedFile, String documentRootElement, String encoding, int numberOfDocumentsToExtract, String documentNamePrefix, String mimeType, boolean includeRootElement) throws IOException {
  StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
  // the root element is matched case-insensitively, so normalise it once
  documentRootElement = documentRootElement.toLowerCase();
  // document name prefix could be an empty string
  documentNamePrefix = documentNamePrefix == null ? "" : documentNamePrefix.trim() + "_";

  // loop-invariant values, computed once instead of once per line
  final String startTagWithAttributes = "<" + documentRootElement + " ";
  final String startTagWithoutAttributes = "<" + documentRootElement + ">";
  final String endTag = "</" + documentRootElement + ">";
  final boolean encodingSupplied = encoding != null && encoding.trim().length() != 0;

  // Keep a handle on the raw stream so that it can still be closed when
  // the reader constructor itself fails (e.g. unsupported encoding).
  java.io.InputStream rawStream = singleConcatenatedFile.openStream();
  BufferedReader br = null;
  try {
    // NOTE(review): 10485760 is presumably the reader's buffer size (10 MiB) - confirm
    if (encodingSupplied) {
      br = new BomStrippingInputStreamReader(rawStream, encoding, 10485760);
    } else {
      br = new BomStrippingInputStreamReader(rawStream, 10485760);
    }

    String line = br.readLine();
    // content of the document currently being assembled
    StringBuilder documentString = new StringBuilder();
    // true while looking for a start tag, false while looking for the end tag
    boolean searchingForStartElement = true;
    // 1-based number of the next document to be created
    int count = 1;
    // length in bytes extracted so far (to return)
    long lengthInBytes = 0;

    while (line != null) {
      // lowercase the line in order to match the root element in any case
      String lowerCasedLine = line.toLowerCase();

      if (searchingForStartElement) {
        // the start tag may or may not carry attributes
        int index = lowerCasedLine.indexOf(startTagWithAttributes);
        if (index == -1) {
          index = lowerCasedLine.indexOf(startTagWithoutAttributes);
        }
        if (index != -1) {
          // drop anything before the start tag and re-examine the same line
          // for the end tag on the next iteration
          line = line.substring(index);
          searchingForStartElement = false;
        } else {
          line = br.readLine();
        }
      } else {
        int index = lowerCasedLine.indexOf(endTag);
        if (index == -1) {
          // no end tag yet: the whole line belongs to the current document
          documentString.append(line).append('\n');
          line = br.readLine();
        } else {
          // end tag found: close the current document just after it
          int endOfDocument = index + endTag.length();
          documentString.append(line.substring(0, endOfDocument));
          searchingForStartElement = true;

          if (sListener != null) {
            sListener.statusChanged("Creating Document Number :" + count);
          }
          String docName = documentNamePrefix + count + "_" + Gate.genSym();
          String docContent = documentString.toString();
          if (!includeRootElement) {
            // strip the opening tag (up to the first '>') and the closing tag
            // (from the last '<')
            docContent = docContent.substring(docContent.indexOf(">") + 1, docContent.lastIndexOf("<"));
          }
          FeatureMap params = Factory.newFeatureMap();
          if (mimeType != null) {
            params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
          }
          params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, docContent);
          if (encodingSupplied) {
            params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
          }
          // NOTE(review): measured with the platform default charset, not
          // with 'encoding' - confirm this is what callers expect
          lengthInBytes += docContent.getBytes().length;
          try {
            Document doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
            count++;
            corpus.add(doc);
            if (corpus.getLRPersistenceId() != null) {
              // persistent corpus -> unload the document
              corpus.unloadDocument(doc);
              Factory.deleteResource(doc);
            }
            // already extracted requested num of documents?
            if ((count - 1) == numberOfDocumentsToExtract) {
              break;
            }
          } catch (Throwable t) {
            // best effort: report the failure and carry on with the next document
            String nl = Strings.getNl();
            Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + " Document name was: " + docName + nl + " Exception was: " + t + nl + nl);
            t.printStackTrace();
          }

          documentString = new StringBuilder();
          if (sListener != null) {
            sListener.statusChanged(docName + " created!");
          }
          // whatever follows the end tag on this line may start another document
          line = line.substring(endOfDocument);
          if (line.trim().equals("")) {
            line = br.readLine();
          }
        }
      }
    }
    return lengthInBytes;
  } finally {
    if (br != null) {
      br.close();
    } else {
      // the reader was never created, so nothing else owns the stream
      rawStream.close();
    }
  }
}
use of gate.event.StatusListener in project gate-core by GateNLP.
the class CorpusImpl method populate.
/**
 * Populates a corpus with one GATE document per file found in a directory.
 * A {@link FileFilter} decides which files are used; a simple
 * extension-based filter is available as
 * {@link gate.util.ExtensionFileFilter}.
 *
 * Files are processed in alphabetical order of their simple names,
 * regardless of the directory they live in. A file whose document cannot be
 * instantiated is reported on the error stream and skipped.
 *
 * @param corpus the corpus to be populated
 * @param directory the directory from which the files will be picked,
 *          given as a URL for uniformity. It must use the <tt>file</tt>
 *          protocol.
 * @param filter the file filter used to select files from the target
 *          directory. If the filter is <tt>null</tt> all the files
 *          will be accepted.
 * @param encoding the encoding to be used for reading the documents; not
 *          set on the documents when <tt>null</tt>
 * @param mimeType the mime type to set on the created documents; not set
 *          when <tt>null</tt>
 * @param recurseDirectories if <tt>true</tt>, files from the provided
 *          directory and all of its sub-directories (on as many levels as
 *          necessary) are considered; otherwise sub-directories are
 *          ignored.
 * @throws IllegalArgumentException if the URL is not a <tt>file</tt> URL or
 *           does not denote a directory
 * @throws java.io.IOException if the directory doesn't exist
 */
public static void populate(Corpus corpus, URL directory, FileFilter filter, String encoding, String mimeType, boolean recurseDirectories) throws IOException {
  // validate the location before touching the file system
  if (!directory.getProtocol().equalsIgnoreCase("file")) {
    throw new IllegalArgumentException("The URL provided is not of type \"file:\"!");
  }
  File sourceDir = Files.fileFromURL(directory);
  if (!sourceDir.exists()) {
    throw new FileNotFoundException(sourceDir.toString());
  }
  if (!sourceDir.isDirectory()) {
    throw new IllegalArgumentException(sourceDir.getAbsolutePath() + " is not a directory!");
  }

  // collect the candidate files, optionally descending into sub-directories
  File[] candidates = recurseDirectories
      ? Files.listFilesRecursively(sourceDir, filter)
      : sourceDir.listFiles(filter);
  if (candidates == null) {
    return;
  }

  // order by simple file name only, ignoring the containing path
  Arrays.sort(candidates, new Comparator<File>() {
    @Override
    public int compare(File left, File right) {
      return left.getName().compareTo(right.getName());
    }
  });

  // turn every regular entry into a GATE document
  for (int i = 0; i < candidates.length; i++) {
    File current = candidates[i];
    if (current.isDirectory()) {
      continue;
    }
    StatusListener listener = (StatusListener) Gate.getListeners().get("gate.event.StatusListener");
    if (listener != null) {
      listener.statusChanged("Reading: " + current.getName());
    }
    String documentName = current.getName() + "_" + Gate.genSym();

    FeatureMap features = Factory.newFeatureMap();
    features.put(Document.DOCUMENT_URL_PARAMETER_NAME, current.toURI().toURL());
    if (encoding != null) {
      features.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
    }
    if (mimeType != null) {
      features.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
    }

    try {
      Document created = (Document) Factory.createResource(DocumentImpl.class.getName(), features, null, documentName);
      corpus.add(created);
      if (corpus.getLRPersistenceId() != null) {
        // the corpus is persistent, so the in-memory copy can be released
        corpus.unloadDocument(created);
        Factory.deleteResource(created);
      }
    } catch (Throwable problem) {
      // report and move on to the next file
      String nl = Strings.getNl();
      Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + " Document name was: " + documentName + nl + " Exception was: " + problem + nl + nl);
      problem.printStackTrace();
    }

    if (listener != null) {
      listener.statusChanged(current.getName() + " read");
    }
  }
}
use of gate.event.StatusListener in project gate-core by GateNLP.
the class PersistenceManager method saveObjectToFile.
/**
 * Save the given object to the given file.
 *
 * Depending on {@code Gate.getUseXMLSerialization()} the object is written
 * either as XML (through XStream) or with native Java serialization. In
 * both cases the persistent form of the currently registered plugins is
 * written first, followed by the persistent form of {@code obj}.
 *
 * Registered progress and status listeners are notified when the operation
 * ends, whether it succeeded or not.
 *
 * @param obj The object to persist.
 * @param file The file where to persist to
 * @param usegatehome if true (recommended) use $gatehome$ and $resourceshome$ instead of
 * $relpath$ in any saved path URLs if the location of that URL is inside GATE home or
 * inside the resources home directory (if set).
 * @param warnaboutgatehome if true, issue a warning message when a saved URL uses $gatehome$
 * or $resourceshome$.
 * @throws PersistenceException
 * @throws IOException
 */
public static void saveObjectToFile(Object obj, File file, boolean usegatehome, boolean warnaboutgatehome) throws PersistenceException, IOException {
  ProgressListener pListener = (ProgressListener) Gate.getListeners().get("gate.event.ProgressListener");
  StatusListener sListener = (gate.event.StatusListener) Gate.getListeners().get("gate.event.StatusListener");
  long startTime = System.currentTimeMillis();
  if (pListener != null) {
    pListener.progressChanged(0);
  }
  // The object output stream is used for native serialization,
  // but the xstream and filewriter are used for XML serialization.
  ObjectOutputStream oos = null;
  com.thoughtworks.xstream.XStream xstream = null;
  HierarchicalStreamWriter writer = null;
  warnAboutGateHome.get().addFirst(warnaboutgatehome);
  useGateHome.get().addFirst(usegatehome);
  startPersistingTo(file);
  try {
    // Read the global setting exactly once so that the branch that opens
    // the output and the branch that writes to it always agree.
    final boolean useXml = Gate.getUseXMLSerialization();
    if (useXml) {
      // Just create the xstream and the filewriter that will later be
      // used to serialize objects.
      xstream = new XStream(new SunUnsafeReflectionProvider(new FieldDictionary(new XStream12FieldKeySorter())), new StaxDriver(new XStream11NameCoder())) {
        @Override
        protected boolean useXStream11XmlFriendlyMapper() {
          return true;
        }
      };
      // NOTE(review): FileWriter(File) encodes with the platform default
      // charset - confirm this matches what the loader expects
      FileWriter fileWriter = new FileWriter(file);
      writer = new PrettyPrintWriter(fileWriter, new XmlFriendlyNameCoder("-", "_"));
    } else {
      oos = new ObjectOutputStream(new FileOutputStream(file));
    }
    Object persistentList = getPersistentRepresentation(Gate.getCreoleRegister().getPlugins());
    Object persistentObject = getPersistentRepresentation(obj);
    if (useXml) {
      // We need to put the urls and the application itself together
      // as xstreams can only hold one object.
      GateApplication gateApplication = new GateApplication();
      gateApplication.urlList = persistentList;
      gateApplication.application = persistentObject;
      // Then do the actual serialization.
      xstream.marshal(gateApplication, writer);
    } else {
      // This is for native serialization: plugin list first, then the object.
      oos.writeObject(persistentList);
      oos.writeObject(persistentObject);
    }
  } finally {
    // The nested try/finally blocks make sure that a failure in one
    // clean-up step does not prevent the following ones from running.
    try {
      finishedPersisting();
    } finally {
      try {
        if (oos != null) {
          try {
            oos.flush();
          } finally {
            oos.close();
          }
        }
        if (writer != null) {
          // Just make sure that all the xml is written, and the file
          // closed.
          try {
            writer.flush();
          } finally {
            writer.close();
          }
        }
      } finally {
        long endTime = System.currentTimeMillis();
        if (sListener != null) {
          sListener.statusChanged("Storing completed in " + NumberFormat.getInstance().format((double) (endTime - startTime) / 1000) + " seconds");
        }
        if (pListener != null) {
          pListener.processFinished();
        }
      }
    }
  }
}
use of gate.event.StatusListener in project gate-core by GateNLP.
the class PersistenceManager method loadObjectFromUrl.
/**
 * Loads a persisted object from the given URL.
 *
 * The content is read either as XML (through XStream) or as a native Java
 * serialization stream, depending on what {@code isXmlApplicationFile}
 * reports for the URL. The stream is expected to hold first a collection of
 * plugins (or plugin URLs), which are registered with the CREOLE register,
 * and then the persisted object itself. A plugin that cannot be registered
 * is reported and skipped.
 *
 * Registered progress and status listeners are notified about the outcome.
 *
 * @param url the location to load from
 * @return the transient representation of the loaded object
 * @throws PersistenceException if reading or restoring the object fails
 * @throws IOException if the URL cannot be opened or a stream cannot be
 *           closed
 * @throws ResourceInstantiationException if GATE has not been initialised,
 *           or if a resource cannot be instantiated while restoring
 */
public static Object loadObjectFromUrl(URL url) throws PersistenceException, IOException, ResourceInstantiationException {
  if (!Gate.isInitialised()) {
    throw new ResourceInstantiationException("You must call Gate.init() before you can restore resources");
  }
  ProgressListener pListener = (ProgressListener) Gate.getListeners().get("gate.event.ProgressListener");
  StatusListener sListener = (gate.event.StatusListener) Gate.getListeners().get("gate.event.StatusListener");
  if (pListener != null) {
    pListener.progressChanged(0);
  }
  startLoadingFrom(url);
  // the actual stream obtained from the URL. We keep a reference to this
  // so we can ensure it gets closed.
  InputStream rawStream = null;
  try {
    long startTime = System.currentTimeMillis();
    // Determine whether the file contains an application serialized in
    // xml format. Otherwise we will assume that it contains native
    // serializations.
    boolean xmlStream = isXmlApplicationFile(url);
    ObjectInputStream ois = null;
    HierarchicalStreamReader reader = null;
    XStream xstream = null;
    // Open the stream once, for both formats, and remember it so that it
    // is closed even when wrapping it in a reader / object stream fails.
    rawStream = url.openStream();
    if (xmlStream) {
      // we don't want to strip the BOM on XML.
      // NOTE(review): this decodes with the platform default charset - confirm
      Reader inputReader = new InputStreamReader(rawStream);
      try {
        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
        inputFactory.setProperty(XMLInputFactory.IS_COALESCING, true);
        XMLStreamReader xsr = inputFactory.createXMLStreamReader(url.toExternalForm(), inputReader);
        reader = new StaxReader(new QNameMap(), xsr);
      } catch (XMLStreamException xse) {
        // make sure the stream is closed, on error
        inputReader.close();
        throw new PersistenceException("Error creating reader", xse);
      }
      xstream = new XStream(new StaxDriver(new XStream11NameCoder())) {
        @Override
        protected boolean useXStream11XmlFriendlyMapper() {
          return true;
        }
      };
      // make XStream load classes through the GATE ClassLoader
      xstream.setClassLoader(Gate.getClassLoader());
      // make the XML stream appear as a normal ObjectInputStream
      ois = xstream.createObjectInputStream(reader);
    } else {
      // use GateAwareObjectInputStream to load classes through the
      // GATE ClassLoader if they can't be loaded through the one
      // ObjectInputStream would normally use
      ois = new GateAwareObjectInputStream(rawStream);
    }
    Object res = null;
    try {
      // first read the list of creole URLs.
      @SuppressWarnings("unchecked") Iterator<?> urlIter = ((Collection<?>) getTransientRepresentation(ois.readObject())).iterator();
      // and re-register them
      while (urlIter.hasNext()) {
        Object anUrl = urlIter.next();
        try {
          if (anUrl instanceof URL) {
            Gate.getCreoleRegister().registerPlugin(new Plugin.Directory((URL) anUrl), false);
          } else if (anUrl instanceof Plugin) {
            Gate.getCreoleRegister().registerPlugin((Plugin) anUrl, false);
          }
        } catch (GateException ge) {
          // a plugin that fails to register is reported but does not abort the load
          System.out.println("We've hit an error!");
          ge.printStackTrace();
          ge.printStackTrace(Err.getPrintWriter());
          Err.prln("Could not reload creole directory " + anUrl);
        }
      }
      // now we can read the saved object in the presence of all
      // the right plugins
      res = ois.readObject();
      // ensure a fresh start
      clearCurrentTransients();
      res = getTransientRepresentation(res);
      long endTime = System.currentTimeMillis();
      if (sListener != null) {
        sListener.statusChanged("Loading completed in " + NumberFormat.getInstance().format((double) (endTime - startTime) / 1000) + " seconds");
      }
      return res;
    } catch (ResourceInstantiationException rie) {
      if (sListener != null) {
        sListener.statusChanged("Failure during instantiation of resources.");
      }
      throw rie;
    } catch (PersistenceException pe) {
      if (sListener != null) {
        sListener.statusChanged("Failure during persistence operations.");
      }
      throw pe;
    } catch (Exception ex) {
      if (sListener != null) {
        sListener.statusChanged("Loading failed!");
      }
      throw new PersistenceException(ex);
    } finally {
      // make sure both get closed, even if closing the first one fails
      try {
        if (ois != null) {
          ois.close();
        }
      } finally {
        if (reader != null) {
          reader.close();
        }
      }
    }
  } finally {
    // the loading state must be reset and listeners notified even if
    // closing the raw stream fails
    try {
      if (rawStream != null) {
        rawStream.close();
      }
    } finally {
      finishedLoading();
      if (pListener != null) {
        pListener.processFinished();
      }
    }
  }
}
use of gate.event.StatusListener in project gate-core by GateNLP.
the class EmailDocumentFormat method unpackMarkup.
/**
 * Unpacks the markup of an email document, i.e. converts markup from the
 * native format (e.g. EMAIL) into annotations in GATE format.
 * Uses the markupElementsMap to determine which elements to convert, and
 * what annotation type names to use.
 * Parsing is always attempted on the document's content; whether the
 * sourceUrl is null does not matter as long as there is content.
 *
 * @param doc The gate document you want to parse.
 * @throws DocumentFormatException if the document is null, has neither a
 *           source URL nor content, or cannot be processed.
 */
@Override
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
  boolean nothingToParse = doc == null || (doc.getSourceUrl() == null && doc.getContent() == null);
  if (nothingToParse) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }
  setNewLineProperty(doc);

  // the handler does the actual work of annotating the messages
  final EmailDocumentHandler handler = new gate.email.EmailDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);

  // forward the handler's status messages to this format's own listeners
  final StatusListener relay = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      // fireStatusChanged is implemented in DocumentFormat.java and inherited here
      fireStatusChanged(text);
    }
  };
  handler.addStatusListener(relay);

  try {
    // create the annotations on the gate document
    handler.annotateMessages();

    // look for paragraphs inside every "body" annotation
    AnnotationSet bodies = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body");
    if (bodies != null && !bodies.isEmpty()) {
      for (Annotation body : bodies) {
        int bodyStart = body.getStartNode().getOffset().intValue();
        int bodyEnd = body.getEndNode().getOffset().intValue();
        annotateParagraphs(doc, bodyStart, bodyEnd, GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
      }
    }
  } catch (IOException e) {
    throw new DocumentFormatException("Couldn't create a buffered reader ", e);
  } catch (InvalidOffsetException e) {
    throw new DocumentFormatException(e);
  } finally {
    // always detach the relay, whatever the outcome
    handler.removeStatusListener(relay);
  }
}
Aggregations