Search in sources :

Example 11 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class CorpusImpl method populate.

/**
 * Fills the provided corpus with documents extracted from a single
 * concatenated (trec style) file.
 * <p>
 * The file is read line by line. Everything from an opening
 * <tt>&lt;documentRootElement&gt;</tt> tag (with or without attributes) up
 * to and including the matching closing tag becomes the content of one new
 * document; text outside such elements is ignored. Tags are matched
 * case-insensitively. Each document is named
 * <tt>documentNamePrefix + counter + "_" + Gate.genSym()</tt>.
 * <p>
 * A document that cannot be instantiated is reported through {@link Err}
 * and skipped; reading then continues with the rest of the file. If the
 * corpus has a persistence id, each document is unloaded and deleted again
 * right after it has been added.
 *
 * @param corpus the corpus to be populated.
 * @param singleConcatenatedFile the trec file.
 * @param documentRootElement text between this element (start and
 *          end) is considered for creating a new document. Must not be
 *          <tt>null</tt>.
 * @param encoding the encoding of the trec file; if <tt>null</tt> or
 *          blank the reader is created without an explicit encoding and
 *          no encoding parameter is passed to the created documents.
 * @param numberOfDocumentsToExtract extracts the specified number of
 *          documents from the trecweb file; -1 to indicate all files.
 * @param documentNamePrefix prefix for the generated document names. It
 *          is trimmed and followed by an underscore; <tt>null</tt> means
 *          no prefix at all.
 * @param mimeType the mime type which determines how the document is
 *          handled; may be <tt>null</tt>, in which case no mime type
 *          parameter is set.
 * @param includeRootElement if <tt>false</tt>, only the text between the
 *          first <tt>&gt;</tt> and the last <tt>&lt;</tt> of the extracted
 *          element is used as document content, i.e. the root tags are
 *          dropped.
 * @return total length of the extracted document contents in bytes. The
 *         bytes are counted with <tt>String.getBytes()</tt>, i.e. in the
 *         platform default charset, and include documents whose
 *         instantiation failed.
 * @throws java.io.IOException if the file cannot be opened or read
 */
public static long populate(Corpus corpus, URL singleConcatenatedFile, String documentRootElement, String encoding, int numberOfDocumentsToExtract, String documentNamePrefix, String mimeType, boolean includeRootElement) throws IOException {
    StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
    // lines are lower-cased before matching, so the element name must be
    // lower-cased as well
    documentRootElement = documentRootElement.toLowerCase();
    // a null prefix means "no prefix"; any other value gets a "_" separator
    documentNamePrefix = documentNamePrefix == null ? "" : documentNamePrefix.trim() + "_";
    // the tag spellings never change while reading, so build them only once
    String startTagWithAttributes = "<" + documentRootElement + " ";
    String startTagPlain = "<" + documentRootElement + ">";
    String endTag = "</" + documentRootElement + ">";
    boolean encodingGiven = encoding != null && encoding.trim().length() != 0;
    // the stream is opened separately from the reader so that it can still
    // be closed when constructing the reader fails (e.g. unknown encoding)
    java.io.InputStream rawStream = singleConcatenatedFile.openStream();
    BufferedReader br = null;
    try {
        // NOTE(review): 10485760 is presumably the reader's buffer size
        // (10 MB) - confirm against BomStrippingInputStreamReader
        if (encodingGiven) {
            br = new BomStrippingInputStreamReader(rawStream, encoding, 10485760);
        } else {
            br = new BomStrippingInputStreamReader(rawStream, 10485760);
        }
        String line = br.readLine();
        // content of the document currently being collected
        StringBuilder documentString = new StringBuilder();
        // true while looking for a start tag, false while looking for the
        // matching end tag
        boolean searchingForStartElement = true;
        // number of the next document; only advanced on successful creation
        int count = 1;
        // length in bytes of the extracted contents (to return)
        long lengthInBytes = 0;
        while (line != null) {
            // lowercase the line in order to match the tags in any case
            String lowerCasedLine = line.toLowerCase();
            if (searchingForStartElement) {
                // the start tag may or may not carry attributes
                int startIndex = lowerCasedLine.indexOf(startTagWithAttributes);
                if (startIndex == -1) {
                    startIndex = lowerCasedLine.indexOf(startTagPlain);
                }
                if (startIndex != -1) {
                    // drop whatever precedes the tag and re-examine the rest
                    // of this line for the end tag
                    line = line.substring(startIndex);
                    searchingForStartElement = false;
                } else {
                    line = br.readLine();
                }
            } else {
                int endIndex = lowerCasedLine.indexOf(endTag);
                if (endIndex == -1) {
                    // still inside the element: the whole line is content
                    documentString.append(line).append('\n');
                    line = br.readLine();
                } else {
                    // the element ends on this line: keep everything up to
                    // and including the end tag
                    int endOfDocument = endIndex + endTag.length();
                    documentString.append(line.substring(0, endOfDocument));
                    searchingForStartElement = true;
                    if (sListener != null) {
                        sListener.statusChanged("Creating Document Number :" + count);
                    }
                    String docName = documentNamePrefix + count + "_" + Gate.genSym();
                    String docContent = documentString.toString();
                    if (!includeRootElement) {
                        // strip the opening and the closing root tag
                        docContent = docContent.substring(docContent.indexOf(">") + 1, docContent.lastIndexOf("<"));
                    }
                    FeatureMap params = Factory.newFeatureMap();
                    if (mimeType != null) {
                        params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
                    }
                    params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, docContent);
                    if (encodingGiven) {
                        params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
                    }
                    // counted in the platform default charset, and counted
                    // even if creating the document fails below
                    lengthInBytes += docContent.getBytes().length;
                    try {
                        Document doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
                        count++;
                        corpus.add(doc);
                        if (corpus.getLRPersistenceId() != null) {
                            // persistent corpus -> unload the document
                            corpus.unloadDocument(doc);
                            Factory.deleteResource(doc);
                        }
                        // already extracted requested num of documents?
                        if ((count - 1) == numberOfDocumentsToExtract) {
                            break;
                        }
                    } catch (Throwable t) {
                        // best effort: report the failure and carry on with
                        // the remaining documents
                        String nl = Strings.getNl();
                        Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + "  Document name was: " + docName + nl + "  Exception was: " + t + nl + nl);
                        t.printStackTrace();
                    }
                    documentString = new StringBuilder();
                    if (sListener != null) {
                        sListener.statusChanged(docName + " created!");
                    }
                    // the rest of the line may already hold the next document
                    line = line.substring(endOfDocument);
                    if (line.trim().equals("")) {
                        line = br.readLine();
                    }
                }
            }
        }
        return lengthInBytes;
    } finally {
        if (br != null) {
            br.close();
        } else {
            // the reader was never created, so nothing else owns the stream
            try {
                rawStream.close();
            } catch (IOException ignored) {
                // the exception that prevented the reader from being created
                // is the one worth reporting
            }
        }
    }
}
Also used : FeatureMap(gate.FeatureMap) BomStrippingInputStreamReader(gate.util.BomStrippingInputStreamReader) BufferedReader(java.io.BufferedReader) StatusListener(gate.event.StatusListener) Document(gate.Document)

Example 12 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class CorpusImpl method populate.

/**
 * Populates the given corpus with one document per file found in a
 * directory. A {@link FileFilter} decides which files are used; a simple
 * extension based filter ships with GATE
 * ({@link gate.util.ExtensionFileFilter}).
 * <p>
 * The selected files are processed in the order of their plain file names
 * (the directory part is not taken into account). Entries that are
 * directories are skipped. A file that cannot be turned into a document is
 * reported through {@link Err} and skipped. If the corpus has a persistence
 * id, each document is unloaded and deleted again right after it has been
 * added. If the directory listing yields <tt>null</tt> the method returns
 * without doing anything.
 *
 * @param corpus the corpus to be populated
 * @param directory the directory to read from, given as a URL for
 *          uniformity. It has to be a <tt>file:</tt> URL.
 * @param filter the file filter used to select files from the target
 *          directory. If the filter is <tt>null</tt> all the files
 *          will be accepted.
 * @param encoding the encoding to be used for reading the documents; if
 *          <tt>null</tt> no encoding parameter is set
 * @param mimeType the mime type to set on the created documents; if
 *          <tt>null</tt> no mime type parameter is set
 * @param recurseDirectories if <tt>true</tt>, files from all nested
 *          directories (at any depth) are considered as well; otherwise
 *          only the direct children of the directory are used.
 * @throws java.io.IOException if the directory doesn't exist
 * @throws IllegalArgumentException if the URL is not a <tt>file:</tt> URL
 *           or does not denote a directory
 */
public static void populate(Corpus corpus, URL directory, FileFilter filter, String encoding, String mimeType, boolean recurseDirectories) throws IOException {
    // validate the input before touching the corpus
    String protocol = directory.getProtocol();
    if (!protocol.equalsIgnoreCase("file")) {
        throw new IllegalArgumentException("The URL provided is not of type \"file:\"!");
    }
    File sourceDir = Files.fileFromURL(directory);
    if (!sourceDir.exists()) {
        throw new FileNotFoundException(sourceDir.toString());
    }
    if (!sourceDir.isDirectory()) {
        throw new IllegalArgumentException(sourceDir.getAbsolutePath() + " is not a directory!");
    }
    // collect the candidate files, optionally descending into sub-directories
    File[] candidates = recurseDirectories ? Files.listFilesRecursively(sourceDir, filter) : sourceDir.listFiles(filter);
    if (candidates == null) {
        return;
    }
    // order by plain file name, ignoring the directory the file lives in
    Comparator<File> byFileName = new Comparator<File>() {

        @Override
        public int compare(File left, File right) {
            return left.getName().compareTo(right.getName());
        }
    };
    Arrays.sort(candidates, byFileName);
    // turn every regular entry into a GATE document
    for (int i = 0; i < candidates.length; i++) {
        File current = candidates[i];
        if (current.isDirectory()) {
            continue;
        }
        String fileName = current.getName();
        StatusListener sListener = (StatusListener) Gate.getListeners().get("gate.event.StatusListener");
        if (sListener != null) {
            sListener.statusChanged("Reading: " + fileName);
        }
        String docName = fileName + "_" + Gate.genSym();
        FeatureMap params = Factory.newFeatureMap();
        params.put(Document.DOCUMENT_URL_PARAMETER_NAME, current.toURI().toURL());
        if (encoding != null) {
            params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
        }
        if (mimeType != null) {
            params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
        }
        try {
            Document created = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
            corpus.add(created);
            boolean persistentCorpus = corpus.getLRPersistenceId() != null;
            if (persistentCorpus) {
                // the corpus is persistent, so the document need not stay loaded
                corpus.unloadDocument(created);
                Factory.deleteResource(created);
            }
        } catch (Throwable t) {
            // best effort: report the failure and move on to the next file
            String nl = Strings.getNl();
            Err.prln("WARNING: Corpus.populate could not instantiate document" + nl
                    + "  Document name was: " + docName + nl
                    + "  Exception was: " + t + nl + nl);
            t.printStackTrace();
        }
        if (sListener != null) {
            sListener.statusChanged(fileName + " read");
        }
    }
}
Also used : FeatureMap(gate.FeatureMap) FileNotFoundException(java.io.FileNotFoundException) StatusListener(gate.event.StatusListener) Document(gate.Document) File(java.io.File)

Example 13 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class PersistenceManager method saveObjectToFile.

/**
 * Save the given object to the given file.
 * <p>
 * Depending on {@link Gate#getUseXMLSerialization()} the data is written
 * either as XML (through XStream) or with native Java serialization. In
 * both cases two things are stored: the persistent representation of the
 * plugins currently known to the CREOLE register, followed by the
 * persistent representation of <tt>obj</tt>.
 * <p>
 * The registered status listener receives a "Storing completed" message and
 * the progress listener a <tt>processFinished()</tt> call when the method
 * ends, whether or not saving succeeded.
 *
 * @param obj The object to persist.
 * @param file The file where to persist to
 * @param usegatehome if true (recommended) use $gatehome$ and $resourceshome$ instead of
 * $relpath$ in any saved path URLs if the location of that URL is inside GATE home or
 * inside the resources home directory (if set).
 * @param warnaboutgatehome if true, issue a warning message when a saved URL uses $gatehome$
 * or $resourceshome$.
 * @throws PersistenceException
 * @throws IOException
 */
public static void saveObjectToFile(Object obj, File file, boolean usegatehome, boolean warnaboutgatehome) throws PersistenceException, IOException {
    ProgressListener pListener = (ProgressListener) Gate.getListeners().get("gate.event.ProgressListener");
    StatusListener sListener = (gate.event.StatusListener) Gate.getListeners().get("gate.event.StatusListener");
    long startTime = System.currentTimeMillis();
    if (pListener != null) {
        pListener.progressChanged(0);
    }
    // Read the setting exactly once: the stream set up below and the write
    // performed afterwards must agree on the serialization mode, otherwise
    // a null xstream/writer or a null oos would be dereferenced.
    boolean useXml = Gate.getUseXMLSerialization();
    // The object output stream is used for native serialization,
    // but the xstream and filewriter are used for XML serialization.
    ObjectOutputStream oos = null;
    com.thoughtworks.xstream.XStream xstream = null;
    HierarchicalStreamWriter writer = null;
    // NOTE(review): these two pushes are presumably undone by
    // finishedPersisting() - confirm
    warnAboutGateHome.get().addFirst(warnaboutgatehome);
    useGateHome.get().addFirst(usegatehome);
    startPersistingTo(file);
    try {
        if (useXml) {
            // Just create the xstream and the filewriter that will later be
            // used to serialize objects.
            xstream = new XStream(new SunUnsafeReflectionProvider(new FieldDictionary(new XStream12FieldKeySorter())), new StaxDriver(new XStream11NameCoder())) {

                @Override
                protected boolean useXStream11XmlFriendlyMapper() {
                    return true;
                }
            };
            // NOTE(review): FileWriter encodes with the platform default
            // charset; left as is because the loading side is not visible
            // here to be changed in step
            FileWriter fileWriter = new FileWriter(file);
            writer = new PrettyPrintWriter(fileWriter, new XmlFriendlyNameCoder("-", "_"));
        } else {
            oos = new ObjectOutputStream(new FileOutputStream(file));
        }
        Object persistentList = getPersistentRepresentation(Gate.getCreoleRegister().getPlugins());
        Object persistentObject = getPersistentRepresentation(obj);
        if (useXml) {
            // We need to put the urls and the application itself together
            // as xstreams can only hold one object.
            GateApplication gateApplication = new GateApplication();
            gateApplication.urlList = persistentList;
            gateApplication.application = persistentObject;
            // Then do the actual serialization.
            xstream.marshal(gateApplication, writer);
        } else {
            // This is for native serialization: first the plugin list,
            // then the object itself
            oos.writeObject(persistentList);
            oos.writeObject(persistentObject);
        }
    } finally {
        // The clean-up steps are nested so that a failure in one of them
        // cannot prevent the following ones from running.
        try {
            finishedPersisting();
        } finally {
            try {
                if (oos != null) {
                    oos.flush();
                    oos.close();
                }
                if (writer != null) {
                    // Just make sure that all the xml is written, and the
                    // file closed.
                    writer.flush();
                    writer.close();
                }
            } finally {
                long endTime = System.currentTimeMillis();
                if (sListener != null) {
                    sListener.statusChanged("Storing completed in " + NumberFormat.getInstance().format((double) (endTime - startTime) / 1000) + " seconds");
                }
                if (pListener != null) {
                    pListener.processFinished();
                }
            }
        }
    }
}
Also used : XStream(com.thoughtworks.xstream.XStream) HierarchicalStreamWriter(com.thoughtworks.xstream.io.HierarchicalStreamWriter) XStream(com.thoughtworks.xstream.XStream) XStream12FieldKeySorter(com.thoughtworks.xstream.converters.reflection.XStream12FieldKeySorter) FileWriter(java.io.FileWriter) XmlFriendlyNameCoder(com.thoughtworks.xstream.io.xml.XmlFriendlyNameCoder) ObjectOutputStream(java.io.ObjectOutputStream) StaxDriver(com.thoughtworks.xstream.io.xml.StaxDriver) ProgressListener(gate.event.ProgressListener) FieldDictionary(com.thoughtworks.xstream.converters.reflection.FieldDictionary) XStream11NameCoder(com.thoughtworks.xstream.io.xml.XStream11NameCoder) FileOutputStream(java.io.FileOutputStream) PrettyPrintWriter(com.thoughtworks.xstream.io.xml.PrettyPrintWriter) StatusListener(gate.event.StatusListener) SunUnsafeReflectionProvider(com.thoughtworks.xstream.converters.reflection.SunUnsafeReflectionProvider)

Example 14 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class PersistenceManager method loadObjectFromUrl.

/**
 * Restores an object previously saved by the persistence manager from the
 * given URL.
 * <p>
 * The content is treated as XML if <tt>isXmlApplicationFile(url)</tt> says
 * so and as native Java serialization otherwise. In both cases two objects
 * are read: first the collection of plugins, each of which is registered
 * with the CREOLE register again (a plugin that fails to register with a
 * {@link GateException} is reported and skipped), then the saved object
 * itself, which is converted to its transient representation and returned.
 * <p>
 * NOTE(review): this deserializes whatever the URL delivers (XStream or
 * Java native serialization); it should only be pointed at trusted content.
 *
 * @param url the location to load from
 * @return the restored object
 * @throws PersistenceException if the XML reader cannot be created, if the
 *           persistence layer reports a problem, or wrapping any other
 *           exception raised while reading
 * @throws IOException if the stream cannot be opened or closed
 * @throws ResourceInstantiationException if GATE has not been initialised
 *           or a resource could not be instantiated while restoring
 */
public static Object loadObjectFromUrl(URL url) throws PersistenceException, IOException, ResourceInstantiationException {
    if (!Gate.isInitialised())
        throw new ResourceInstantiationException("You must call Gate.init() before you can restore resources");
    ProgressListener pListener = (ProgressListener) Gate.getListeners().get("gate.event.ProgressListener");
    StatusListener sListener = (gate.event.StatusListener) Gate.getListeners().get("gate.event.StatusListener");
    if (pListener != null)
        pListener.progressChanged(0);
    startLoadingFrom(url);
    // the actual stream obtained from the URL. We keep a reference to this
    // so we can ensure it gets closed, whichever branch opened it and
    // however far the set-up of the wrapping readers got.
    InputStream rawStream = null;
    try {
        long startTime = System.currentTimeMillis();
        // Determine whether the file contains an application serialized in
        // xml format. Otherwise we will assume that it contains native
        // serializations.
        boolean xmlStream = isXmlApplicationFile(url);
        ObjectInputStream ois = null;
        HierarchicalStreamReader reader = null;
        XStream xstream = null;
        if (xmlStream) {
            // we don't want to strip the BOM on XML.
            Reader inputReader = new InputStreamReader(rawStream = url.openStream());
            try {
                XMLInputFactory inputFactory = XMLInputFactory.newInstance();
                inputFactory.setProperty(XMLInputFactory.IS_COALESCING, true);
                XMLStreamReader xsr = inputFactory.createXMLStreamReader(url.toExternalForm(), inputReader);
                reader = new StaxReader(new QNameMap(), xsr);
            } catch (XMLStreamException xse) {
                // make sure the stream is closed, on error
                inputReader.close();
                throw new PersistenceException("Error creating reader", xse);
            }
            xstream = new XStream(new StaxDriver(new XStream11NameCoder())) {

                @Override
                protected boolean useXStream11XmlFriendlyMapper() {
                    return true;
                }
            };
            // make XStream load classes through the GATE ClassLoader
            xstream.setClassLoader(Gate.getClassLoader());
            // make the XML stream appear as a normal ObjectInputStream
            ois = xstream.createObjectInputStream(reader);
        } else {
            // use GateAwareObjectInputStream to load classes through the
            // GATE ClassLoader if they can't be loaded through the one
            // ObjectInputStream would normally use.
            // The raw stream is remembered first: if wrapping it fails,
            // ois stays null and only the outer finally can still close it.
            rawStream = url.openStream();
            ois = new GateAwareObjectInputStream(rawStream);
        }
        Object res = null;
        try {
            // first read the list of creole URLs.
            @SuppressWarnings("unchecked") Iterator<?> urlIter = ((Collection<?>) getTransientRepresentation(ois.readObject())).iterator();
            // and re-register them
            while (urlIter.hasNext()) {
                Object anUrl = urlIter.next();
                try {
                    // entries are either plain directory URLs or Plugin objects
                    if (anUrl instanceof URL)
                        Gate.getCreoleRegister().registerPlugin(new Plugin.Directory((URL) anUrl), false);
                    else if (anUrl instanceof Plugin)
                        Gate.getCreoleRegister().registerPlugin((Plugin) anUrl, false);
                } catch (GateException ge) {
                    // a plugin that cannot be registered does not abort loading
                    System.out.println("We've hit an error!");
                    ge.printStackTrace();
                    ge.printStackTrace(Err.getPrintWriter());
                    Err.prln("Could not reload creole directory " + anUrl);
                }
            }
            // now we can read the saved object in the presence of all
            // the right plugins
            res = ois.readObject();
            // ensure a fresh start
            clearCurrentTransients();
            res = getTransientRepresentation(res);
            long endTime = System.currentTimeMillis();
            if (sListener != null)
                sListener.statusChanged("Loading completed in " + NumberFormat.getInstance().format((double) (endTime - startTime) / 1000) + " seconds");
            return res;
        } catch (ResourceInstantiationException rie) {
            if (sListener != null)
                sListener.statusChanged("Failure during instantiation of resources.");
            throw rie;
        } catch (PersistenceException pe) {
            if (sListener != null)
                sListener.statusChanged("Failure during persistence operations.");
            throw pe;
        } catch (Exception ex) {
            if (sListener != null)
                sListener.statusChanged("Loading failed!");
            throw new PersistenceException(ex);
        } finally {
            // make sure the stream gets closed
            if (ois != null)
                ois.close();
            if (reader != null)
                reader.close();
        }
    } finally {
        // closing a stream that the wrappers above already closed has no effect
        if (rawStream != null)
            rawStream.close();
        finishedLoading();
        if (pListener != null)
            pListener.processFinished();
    }
}
Also used : XMLStreamReader(javax.xml.stream.XMLStreamReader) XMLStreamReader(javax.xml.stream.XMLStreamReader) Reader(java.io.Reader) HierarchicalStreamReader(com.thoughtworks.xstream.io.HierarchicalStreamReader) BomStrippingInputStreamReader(gate.util.BomStrippingInputStreamReader) StaxReader(com.thoughtworks.xstream.io.xml.StaxReader) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) URL(java.net.URL) StaxDriver(com.thoughtworks.xstream.io.xml.StaxDriver) HierarchicalStreamReader(com.thoughtworks.xstream.io.HierarchicalStreamReader) QNameMap(com.thoughtworks.xstream.io.xml.QNameMap) StaxReader(com.thoughtworks.xstream.io.xml.StaxReader) BomStrippingInputStreamReader(gate.util.BomStrippingInputStreamReader) InputStreamReader(java.io.InputStreamReader) GateAwareObjectInputStream(gate.persist.GateAwareObjectInputStream) ObjectInputStream(java.io.ObjectInputStream) GateAwareObjectInputStream(gate.persist.GateAwareObjectInputStream) InputStream(java.io.InputStream) XStream(com.thoughtworks.xstream.XStream) GateException(gate.util.GateException) URISyntaxException(java.net.URISyntaxException) XMLStreamException(javax.xml.stream.XMLStreamException) PersistenceException(gate.persist.PersistenceException) GateRuntimeException(gate.util.GateRuntimeException) ResourceInstantiationException(gate.creole.ResourceInstantiationException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) GateException(gate.util.GateException) ResourceInstantiationException(gate.creole.ResourceInstantiationException) ProgressListener(gate.event.ProgressListener) XMLStreamException(javax.xml.stream.XMLStreamException) XStream11NameCoder(com.thoughtworks.xstream.io.xml.XStream11NameCoder) PersistenceException(gate.persist.PersistenceException) Collection(java.util.Collection) StatusListener(gate.event.StatusListener) XMLInputFactory(javax.xml.stream.XMLInputFactory) ObjectInputStream(java.io.ObjectInputStream) 
GateAwareObjectInputStream(gate.persist.GateAwareObjectInputStream) Plugin(gate.creole.Plugin)

Example 15 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class EmailDocumentFormat method unpackMarkup.

/**
 * Unpacks the markup of an e-mail document, i.e. converts markup from the
 * native format (e.g. EMAIL) into annotations in GATE format.
 * <p>
 * The work is delegated to an {@link EmailDocumentHandler}, which is given
 * this format's <tt>markupElementsMap</tt> and <tt>element2StringMap</tt>.
 * Afterwards every "body" annotation found in the original markups
 * annotation set is passed to <tt>annotateParagraphs</tt>. Status messages
 * issued by the handler are forwarded to the listeners of this format.
 * <p>
 * The method always tries to parse the document's content; parsing is only
 * refused when the document is <tt>null</tt> or has neither a source URL
 * nor any content.
 *
 * @param doc The gate document you want to parse.
 * @throws DocumentFormatException if there is nothing to parse, or wrapping
 *           an {@link IOException} or {@link InvalidOffsetException} raised
 *           while annotating
 */
@Override
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
    boolean nothingToParse = (doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null);
    if (nothingToParse) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    setNewLineProperty(doc);
    // the handler does the actual conversion of the message markup
    EmailDocumentHandler emailDocHandler = new gate.email.EmailDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
    // relay the handler's status messages to our own listeners
    StatusListener statusListener = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            // fireStatusChanged is implemented in DocumentFormat.java and inherited here
            fireStatusChanged(text);
        }
    };
    emailDocHandler.addStatusListener(statusListener);
    try {
        // create the annotations on the gate document
        emailDocHandler.annotateMessages();
        // look for paragraphs inside every body annotation
        AnnotationSet bodyAnnotations = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body");
        if (bodyAnnotations != null && !bodyAnnotations.isEmpty()) {
            for (Annotation body : bodyAnnotations) {
                int bodyStart = body.getStartNode().getOffset().intValue();
                int bodyEnd = body.getEndNode().getOffset().intValue();
                annotateParagraphs(doc, bodyStart, bodyEnd, GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
            }
        }
    } catch (IOException e) {
        throw new DocumentFormatException("Couldn't create a buffered reader ", e);
    } catch (InvalidOffsetException e) {
        throw new DocumentFormatException(e);
    } finally {
        // the listener is only needed for the duration of this call
        emailDocHandler.removeStatusListener(statusListener);
    }
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) EmailDocumentHandler(gate.email.EmailDocumentHandler) EmailDocumentHandler(gate.email.EmailDocumentHandler) AnnotationSet(gate.AnnotationSet) InvalidOffsetException(gate.util.InvalidOffsetException) StatusListener(gate.event.StatusListener) IOException(java.io.IOException) Annotation(gate.Annotation)

Aggregations

StatusListener (gate.event.StatusListener)15 IOException (java.io.IOException)7 DocumentFormatException (gate.util.DocumentFormatException)6 Annotation (gate.Annotation)4 AnnotationSet (gate.AnnotationSet)4 ResourceInstantiationException (gate.creole.ResourceInstantiationException)4 FeatureMap (gate.FeatureMap)3 GateRuntimeException (gate.util.GateRuntimeException)3 InputStream (java.io.InputStream)3 XStream (com.thoughtworks.xstream.XStream)2 StaxDriver (com.thoughtworks.xstream.io.xml.StaxDriver)2 XStream11NameCoder (com.thoughtworks.xstream.io.xml.XStream11NameCoder)2 Document (gate.Document)2 ProgressListener (gate.event.ProgressListener)2 PersistenceException (gate.persist.PersistenceException)2 BomStrippingInputStreamReader (gate.util.BomStrippingInputStreamReader)2 GateException (gate.util.GateException)2 InvalidOffsetException (gate.util.InvalidOffsetException)2 XmlDocumentHandler (gate.xml.XmlDocumentHandler)2 BufferedReader (java.io.BufferedReader)2