Search in sources :

Example 31 with FeatureMap

use of gate.FeatureMap in project gate-core by GateNLP.

the class AnnotationImpl method addAnnotationListener.

/**
 * Adds an annotation listener. Uses a copy-on-write scheme: the listener
 * Vector is cloned, modified, and then published back to the
 * {@code annotationListeners} field in one assignment, so readers never
 * observe a partially updated list. The method is {@code synchronized},
 * serialising concurrent registrations.
 */
@Override
public synchronized void addAnnotationListener(AnnotationListener l) {
    // Copy-on-write: start from a clone of the current listener list, or a
    // small fresh Vector if no listener has been registered yet.
    @SuppressWarnings("unchecked") Vector<AnnotationListener> v = annotationListeners == null ? new Vector<AnnotationListener>(2) : (Vector<AnnotationListener>) annotationListeners.clone();
    // If this is the first annotation listener, also hook an events handler
    // onto this annotation's FeatureMap so that feature changes can
    // also be propagated to annotation listeners (via EventsHandler).
    if (v.isEmpty()) {
        FeatureMap features = getFeatures();
        if (eventHandler == null)
            eventHandler = new EventsHandler();
        features.addFeatureMapListener(eventHandler);
    }
    // Register each listener at most once, then publish the new list.
    if (!v.contains(l)) {
        v.addElement(l);
        annotationListeners = v;
    }
}
Also used : FeatureMap(gate.FeatureMap) AnnotationListener(gate.event.AnnotationListener)

Example 32 with FeatureMap

use of gate.FeatureMap in project gate-core by GateNLP.

the class DocumentImpl method hasOriginalContentFeatures.

// saveAnnotationSetAsXml()
/*
   * Old method created by Cristian. Create content backward.
   * 
   * private String saveAnnotationSetAsXml(List aDumpAnnotList, boolean
   * includeFeatures){ String content = null; if (this.getContent()== null)
   * content = new String(""); else content = this.getContent().toString();
   * StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
   * if (aDumpAnnotList == null) return docContStrBuff.toString();
   * 
   * TreeMap offsets2CharsMap = new TreeMap(); HashMap annotsForOffset = new
   * HashMap(100); if (this.getContent().size().longValue() != 0){ // Fill the
   * offsets2CharsMap with all the indices where // special chars appear
   * buildEntityMapFromString(content,offsets2CharsMap); }//End if // The saving
   * algorithm is as follows: /////////////////////////////////////////// //
   * Construct a set of annot with all IDs in asc order. // All annotations that
   * end at that offset swap their place in descending // order. For each node
   * write all the tags from left to right. // Construct the node set TreeSet
   * offsets = new TreeSet(); Iterator iter = aDumpAnnotList.iterator(); while
   * (iter.hasNext()){ Annotation annot = (Annotation) iter.next();
   * offsets.add(annot.getStartNode().getOffset());
   * offsets.add(annot.getEndNode().getOffset()); if
   * (annotsForOffset.containsKey(annot.getStartNode().getOffset())) { ((List)
   * annotsForOffset.get(annot.getStartNode().getOffset())).add(annot); } else {
   * List newList = new ArrayList(10); newList.add(annot);
   * annotsForOffset.put(annot.getStartNode().getOffset(), newList); } if
   * (annotsForOffset.containsKey(annot.getEndNode().getOffset())) { ((List)
   * annotsForOffset.get(annot.getEndNode().getOffset())).add(annot); } else {
   * List newList = new ArrayList(10); newList.add(annot);
   * annotsForOffset.put(annot.getEndNode().getOffset(), newList); } }// End
   * while // offsets is sorted in ascending order. // Iterate this set in
   * descending order and remove an offset at each // iteration while
   * (!offsets.isEmpty()){ Long offset = (Long)offsets.last(); // Remove the
   * offset from the set offsets.remove(offset); // Now, use it. // Returns a
   * list with annotations that needs to be serialized in that // offset. //
   * List annotations = getAnnotationsForOffset(aDumpAnnotList,offset); List
   * annotations = (List) annotsForOffset.get(offset); annotations =
   * getAnnotationsForOffset(annotations,offset); // Attention: the annotation
   * are serialized from left to right // StringBuffer tmpBuff = new
   * StringBuffer(""); StringBuffer tmpBuff = new StringBuffer(
   * DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
   * Stack stack = new Stack(); // Iterate through all these annotations and
   * serialize them Iterator it = annotations.iterator(); while(it.hasNext()){
   * Annotation a = (Annotation) it.next(); it.remove(); // Test if a Ends at
   * offset if ( offset.equals(a.getEndNode().getOffset()) ){ // Test if a
   * Starts at offset if ( offset.equals(a.getStartNode().getOffset()) ){ //
   * Here, the annotation a Starts and Ends at the offset if ( null !=
   * a.getFeatures().get("isEmptyAndSpan") &&
   * "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){ // Assert:
   * annotation a with start == end and isEmptyAndSpan
   * tmpBuff.append(writeStartTag(a, includeFeatures)); stack.push(a); }else{ //
   * Assert annotation a with start == end and an empty tag
   * tmpBuff.append(writeEmptyTag(a)); // The annotation is removed from dumped
   * set aDumpAnnotList.remove(a); }// End if }else{ // Here the annotation a
   * Ends at the offset. // In this case empty the stack and write the end tag
   * if (!stack.isEmpty()){ while(!stack.isEmpty()){ Annotation a1 =
   * (Annotation)stack.pop(); tmpBuff.append(writeEndTag(a1)); }// End while }//
   * End if tmpBuff.append(writeEndTag(a)); }// End if }else{ // The annotation
   * a does NOT end at the offset. Let's see if it starts // at the offset if (
   * offset.equals(a.getStartNode().getOffset()) ){ // The annotation a starts
   * at the offset. // In this case empty the stack and write the end tag if
   * (!stack.isEmpty()){ while(!stack.isEmpty()){ Annotation a1 =
   * (Annotation)stack.pop(); tmpBuff.append(writeEndTag(a1)); }// End while }//
   * End if tmpBuff.append(writeStartTag(a, includeFeatures)); // The annotation
   * is removed from dumped set aDumpAnnotList.remove(a); }// End if (
   * offset.equals(a.getStartNode().getOffset()) ) }// End if (
   * offset.equals(a.getEndNode().getOffset()) ) }// End while(it.hasNext()){ //
   * In this case empty the stack and write the end tag if (!stack.isEmpty()){
   * while(!stack.isEmpty()){ Annotation a1 = (Annotation)stack.pop();
   * tmpBuff.append(writeEndTag(a1)); }// End while }// End if // Before
   * inserting tmpBuff into docContStrBuff we need to check // if there are
   * chars to be replaced and if there are, they would be // replaced. if
   * (!offsets2CharsMap.isEmpty()){ Long offsChar = (Long)
   * offsets2CharsMap.lastKey(); while( !offsets2CharsMap.isEmpty() &&
   * offsChar.intValue() >= offset.intValue()){ // Replace the char at offsChar
   * with its corresponding entity form // the entitiesMap.
   * docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
   * (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); //
   * Discard the offsChar after it was used. offsets2CharsMap.remove(offsChar); //
   * Investigate next offsChar if (!offsets2CharsMap.isEmpty()) offsChar =
   * (Long) offsets2CharsMap.lastKey(); }// End while }// End if // Insert
   * tmpBuff to the location where it belongs in docContStrBuff
   * docContStrBuff.insert(offset.intValue(),tmpBuff.toString()); }// End
   * while(!offsets.isEmpty()) // Need to replace the entities in the remaining
   * text, if there is any text // So, if there are any more items in
   * offsets2CharsMap they need to be // replaced while
   * (!offsets2CharsMap.isEmpty()){ Long offsChar = (Long)
   * offsets2CharsMap.lastKey(); // Replace the char with its entity
   * docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
   * (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); //
   * remove the offset from the map offsets2CharsMap.remove(offsChar); }// End
   * while return docContStrBuff.toString(); }// saveAnnotationSetAsXml()
   */
/**
 * Return true only if the document has features for original content and
 * repositioning information.
 *
 * @return {@code true} when both the original-document-content feature and
 *         the repositioning-info feature are present (non-null).
 */
private boolean hasOriginalContentFeatures() {
    FeatureMap features = getFeatures();
    // Both features must be present for original-markup preservation to
    // work; return the conjunction directly instead of going through a
    // redundantly pre-initialised local.
    return (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null)
            && (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME) != null);
}
Also used : FeatureMap(gate.FeatureMap)

Example 33 with FeatureMap

use of gate.FeatureMap in project gate-core by GateNLP.

the class CorpusImpl method populate.

/**
 * Fills the provided corpus with documents extracted from the
 * provided trec file.
 *
 * @param corpus the corpus to be populated.
 * @param singleConcatenatedFile the trec file.
 * @param documentRootElement text between this element (start and
 *          end) is considered for creating a new document.
 * @param encoding the encoding of the trec file; also used when
 *          computing the returned byte length. May be null/blank for
 *          the platform default.
 * @param numberOfDocumentsToExtract extracts the specified number of
 *          documents from the trecweb file; -1 to indicate all files.
 * @param mimeType the mime type which determines how the document is handled
 * @return total length of populated documents in the corpus in number
 *         of bytes
 * @throws java.io.IOException if the file cannot be read or the
 *           encoding is unsupported
 */
public static long populate(Corpus corpus, URL singleConcatenatedFile, String documentRootElement, String encoding, int numberOfDocumentsToExtract, String documentNamePrefix, String mimeType, boolean includeRootElement) throws IOException {
    StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
    // obtain the root element that user has provided
    // content between the start and end of root element is considered
    // for creating documents
    documentRootElement = documentRootElement.toLowerCase();
    // document name prefix could be an empty string
    documentNamePrefix = documentNamePrefix == null ? "" : documentNamePrefix.trim() + "_";
    // we start a new document when we find <documentRootElement> and
    // close it when we find </documentRootElement>
    BufferedReader br = null;
    try {
        if (encoding != null && encoding.trim().length() != 0) {
            br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(), encoding, 10485760);
        } else {
            br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(), 10485760);
        }
        // reading line by line
        String line = br.readLine();
        // this is where we store document content
        StringBuilder documentString = new StringBuilder();
        // toggle switch to indicate search for start element
        boolean searchingForStartElement = true;
        // keeping count of number of documents extracted
        int count = 1;
        // length in bytes read so far (to return)
        long lengthInBytes = 0;
        // continue until reached the end of file
        while (line != null) {
            // lowercase the line in order to match documentRootElement in any case
            String lowerCasedLine = line.toLowerCase();
            // if searching for startElement?
            if (searchingForStartElement) {
                // may be its with attributes
                int index = lowerCasedLine.indexOf("<" + documentRootElement + " ");
                // may be no attributes?
                if (index == -1) {
                    index = lowerCasedLine.indexOf("<" + documentRootElement + ">");
                }
                // skip the current line and start reading from the next line
                if (index != -1) {
                    // if found, that's the first line
                    line = line.substring(index);
                    searchingForStartElement = false;
                } else {
                    line = br.readLine();
                }
            } else {
                // now searching for last element
                int index = lowerCasedLine.indexOf("</" + documentRootElement + ">");
                // if not found.. this is the content of a new document
                if (index == -1) {
                    documentString.append(line + "\n");
                    line = br.readLine();
                } else {
                    // found.. then end the document
                    // index + rootElement.length() + 3 covers "</" + rootElement + ">"
                    documentString.append(line.substring(0, index + documentRootElement.length() + 3));
                    // getting ready for the next document
                    searchingForStartElement = true;
                    // here lets create a new document create the doc
                    if (sListener != null)
                        sListener.statusChanged("Creating Document Number :" + count);
                    String docName = documentNamePrefix + count + "_" + Gate.genSym();
                    String docContent = documentString.toString();
                    if (!includeRootElement)
                        docContent = docContent.substring(docContent.indexOf(">") + 1, docContent.lastIndexOf("<"));
                    FeatureMap params = Factory.newFeatureMap();
                    if (mimeType != null)
                        params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
                    params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, docContent);
                    if (encoding != null && encoding.trim().length() > 0)
                        params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
                    // calculate the length using the declared encoding when one
                    // was supplied (getBytes() with no argument would use the
                    // platform default charset and could report a wrong length)
                    lengthInBytes += (encoding != null && encoding.trim().length() > 0) ? docContent.getBytes(encoding).length : docContent.getBytes().length;
                    try {
                        Document doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
                        count++;
                        corpus.add(doc);
                        if (corpus.getLRPersistenceId() != null) {
                            // persistent corpus -> unload the document
                            corpus.unloadDocument(doc);
                            Factory.deleteResource(doc);
                        }
                        // already extracted requested num of documents?
                        if ((count - 1) == numberOfDocumentsToExtract)
                            break;
                    } catch (Throwable t) {
                        String nl = Strings.getNl();
                        Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + "  Document name was: " + docName + nl + "  Exception was: " + t + nl + nl);
                        t.printStackTrace();
                    }
                    documentString = new StringBuilder();
                    if (sListener != null)
                        sListener.statusChanged(docName + " created!");
                    line = line.substring(index + documentRootElement.length() + 3);
                    if (line.trim().equals(""))
                        line = br.readLine();
                }
            }
        }
        return lengthInBytes;
    } finally {
        if (br != null)
            br.close();
    }
}
Also used : FeatureMap(gate.FeatureMap) BomStrippingInputStreamReader(gate.util.BomStrippingInputStreamReader) BufferedReader(java.io.BufferedReader) StatusListener(gate.event.StatusListener) Document(gate.Document)

Example 34 with FeatureMap

use of gate.FeatureMap in project gate-core by GateNLP.

the class CorpusImpl method populate.

/**
 * Fills the provided corpus with documents created on the fly from
 * selected files in a directory. Uses a {@link FileFilter} to select
 * which files will be used and which will be ignored. A simple file
 * filter based on extensions is provided in the Gate distribution (
 * {@link gate.util.ExtensionFileFilter}).
 *
 * @param corpus the corpus to be populated
 * @param directory the directory from which the files will be picked.
 *          This parameter is an URL for uniformity. It needs to be a
 *          URL of type file otherwise an InvalidArgumentException
 *          will be thrown.
 * @param filter the file filter used to select files from the target
 *          directory. If the filter is <tt>null</tt> all the files
 *          will be accepted.
 * @param encoding the encoding to be used for reading the documents
 * @param recurseDirectories should the directory be parsed
 *          recursively?. If <tt>true</tt> all the files from the
 *          provided directory and all its children directories (on as
 *          many levels as necessary) will be picked if accepted by
 *          the filter otherwise the children directories will be
 *          ignored.
 * @throws java.io.IOException if a file doesn't exist
 */
public static void populate(Corpus corpus, URL directory, FileFilter filter, String encoding, String mimeType, boolean recurseDirectories) throws IOException {
    // check input
    if (!directory.getProtocol().equalsIgnoreCase("file"))
        throw new IllegalArgumentException("The URL provided is not of type \"file:\"!");
    File dir = Files.fileFromURL(directory);
    if (!dir.exists())
        throw new FileNotFoundException(dir.toString());
    if (!dir.isDirectory())
        throw new IllegalArgumentException(dir.getAbsolutePath() + " is not a directory!");
    File[] files;
    // populate the corpus
    if (recurseDirectories) {
        files = Files.listFilesRecursively(dir, filter);
    } else {
        files = dir.listFiles(filter);
    }
    if (files == null) {
        return;
    }
    // sort the files alphabetically regardless of their paths
    Arrays.sort(files, new Comparator<File>() {

        @Override
        public int compare(File f1, File f2) {
            return f1.getName().compareTo(f2.getName());
        }
    });
    // The status-listener lookup is loop-invariant; fetch it once before
    // the loop (consistent with the single-concatenated-file overload)
    // instead of once per file.
    StatusListener sListener = (StatusListener) Gate.getListeners().get("gate.event.StatusListener");
    // create the GATE documents
    for (File file : files) {
        if (file.isDirectory()) {
            continue;
        }
        if (sListener != null)
            sListener.statusChanged("Reading: " + file.getName());
        String docName = file.getName() + "_" + Gate.genSym();
        FeatureMap params = Factory.newFeatureMap();
        params.put(Document.DOCUMENT_URL_PARAMETER_NAME, file.toURI().toURL());
        if (encoding != null)
            params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
        if (mimeType != null)
            params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
        try {
            Document doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
            corpus.add(doc);
            if (corpus.getLRPersistenceId() != null) {
                // persistent corpus -> unload the document
                corpus.unloadDocument(doc);
                Factory.deleteResource(doc);
            }
        } catch (Throwable t) {
            String nl = Strings.getNl();
            Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + "  Document name was: " + docName + nl + "  Exception was: " + t + nl + nl);
            t.printStackTrace();
        }
        if (sListener != null)
            sListener.statusChanged(file.getName() + " read");
    }
}
Also used : FeatureMap(gate.FeatureMap) FileNotFoundException(java.io.FileNotFoundException) StatusListener(gate.event.StatusListener) Document(gate.Document) File(java.io.File)

Example 35 with FeatureMap

use of gate.FeatureMap in project gate-core by GateNLP.

the class NekoHtmlDocumentHandler method startElement.

/**
 * Called when the parser encounters the start of an HTML element.
 * Empty elements also trigger this method, followed immediately by an
 * {@link #endElement}.
 */
@Override
public void startElement(QName element, XMLAttributes attributes, Augmentations augs) throws XNIException {
    // deal with any outstanding character content
    charactersAction();
    if (DEBUG_ELEMENTS) {
        Out.println("startElement: " + element.localpart);
    }
    // fire a status event every ELEMENTS_RATE elements
    if (0 == (++elements % ELEMENTS_RATE))
        fireStatusChangedEvent("Processed elements : " + elements);
    // Start of ignorable tag: track nesting depth so content inside these
    // tags can be skipped until the matching end tag.
    if (ignorableTags.contains(element.localpart)) {
        ignorableTagLevels++;
        if (DEBUG_ELEMENTS) {
            Out.println("  ignorable tag: levels = " + ignorableTagLevels);
        }
    }
    // if
    // Construct a feature map from the attributes list
    FeatureMap fm = Factory.newFeatureMap();
    // Take all the attributes and put them into the feature map
    for (int i = 0; i < attributes.getLength(); i++) {
        if (DEBUG_ELEMENTS) {
            Out.println("  attribute: " + attributes.getLocalName(i) + " = " + attributes.getValue(i));
        }
        fm.put(attributes.getLocalName(i), attributes.getValue(i));
    }
    // Analyse the tag and add some \n chars and spaces to the
    // tmpDocContent. The reason behind this is that we need to have a
    // readable form for the final document.
    customizeAppearanceOfDocumentWithStartTag(element.localpart);
    // create the start index of the annotation; use Long.valueOf rather
    // than the deprecated Long(long) constructor
    Long startIndex = Long.valueOf(tmpDocContent.length());
    // initially the start index is equal to the end index
    CustomObject obj = new CustomObject(element.localpart, fm, startIndex, startIndex);
    // put it into the stack of pending annotations
    stack.push(obj);
}
Also used : FeatureMap(gate.FeatureMap)

Aggregations

FeatureMap (gate.FeatureMap)55 Document (gate.Document)15 URL (java.net.URL)14 ResourceInstantiationException (gate.creole.ResourceInstantiationException)11 File (java.io.File)10 Resource (gate.Resource)8 GateRuntimeException (gate.util.GateRuntimeException)7 ArrayList (java.util.ArrayList)7 List (java.util.List)7 PersistenceException (gate.persist.PersistenceException)6 Annotation (gate.Annotation)5 AnnotationSet (gate.AnnotationSet)5 DataStore (gate.DataStore)5 LanguageResource (gate.LanguageResource)5 TestDocument (gate.corpora.TestDocument)4 ResourceData (gate.creole.ResourceData)4 SerialDataStore (gate.persist.SerialDataStore)4 InvalidOffsetException (gate.util.InvalidOffsetException)4 Corpus (gate.Corpus)3 ProcessingResource (gate.ProcessingResource)3