Search in sources :

Example 31 with FeatureMap

use of gate.FeatureMap in project gate-core by GateNLP.

the class AnnotationImpl method addAnnotationListener.

 * Adds an annotation listener
public synchronized void addAnnotationListener(AnnotationListener l) {
    @SuppressWarnings("unchecked") Vector<AnnotationListener> v = annotationListeners == null ? new Vector<AnnotationListener>(2) : (Vector<AnnotationListener>) annotationListeners.clone();
    // also be propagated
    if (v.isEmpty()) {
        FeatureMap features = getFeatures();
        if (eventHandler == null)
            eventHandler = new EventsHandler();
    if (!v.contains(l)) {
        annotationListeners = v;
Also used : FeatureMap(gate.FeatureMap) AnnotationListener(gate.event.AnnotationListener)

Example 32 with FeatureMap

use of gate.FeatureMap in project gate-core by GateNLP.

the class DocumentImpl method hasOriginalContentFeatures.

// saveAnnotationSetAsXml()
   * Old method created by Cristian. Create content backward.
   * private String saveAnnotationSetAsXml(List aDumpAnnotList, boolean
   * includeFeatures){ String content = null; if (this.getContent()== null)
   * content = new String(""); else content = this.getContent().toString();
   * StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
   * if (aDumpAnnotList == null) return docContStrBuff.toString();
   * TreeMap offsets2CharsMap = new TreeMap(); HashMap annotsForOffset = new
   * HashMap(100); if (this.getContent().size().longValue() != 0){ // Fill the
   * offsets2CharsMap with all the indices where // special chars appear
   * buildEntityMapFromString(content,offsets2CharsMap); }//End if // The saving
   * algorithm is as follows: /////////////////////////////////////////// //
   * Construct a set of annot with all IDs in asc order. // All annotations that
   * end at that offset swap their place in descending // order. For each node
   * write all the tags from left to right. // Construct the node set TreeSet
   * offsets = new TreeSet(); Iterator iter = aDumpAnnotList.iterator(); while
   * (iter.hasNext()){ Annotation annot = (Annotation) iter.next();
   * offsets.add(annot.getStartNode().getOffset());
   * offsets.add(annot.getEndNode().getOffset()); if
   * (annotsForOffset.containsKey(annot.getStartNode().getOffset())) { ((List)
   * annotsForOffset.get(annot.getStartNode().getOffset())).add(annot); } else {
   * List newList = new ArrayList(10); newList.add(annot);
   * annotsForOffset.put(annot.getStartNode().getOffset(), newList); } if
   * (annotsForOffset.containsKey(annot.getEndNode().getOffset())) { ((List)
   * annotsForOffset.get(annot.getEndNode().getOffset())).add(annot); } else {
   * List newList = new ArrayList(10); newList.add(annot);
   * annotsForOffset.put(annot.getEndNode().getOffset(), newList); } }// End
   * while // offsets is sorted in ascending order. // Iterate this set in
   * descending order and remove an offset at each // iteration while
   * (!offsets.isEmpty()){ Long offset = (Long)offsets.last(); // Remove the
   * offset from the set offsets.remove(offset); // Now, use it. // Returns a
   * list with annotations that needs to be serialized in that // offset. //
   * List annotations = getAnnotationsForOffset(aDumpAnnotList,offset); List
   * annotations = (List) annotsForOffset.get(offset); annotations =
   * getAnnotationsForOffset(annotations,offset); // Attention: the annotation
   * are serialized from left to right // StringBuffer tmpBuff = new
   * StringBuffer(""); StringBuffer tmpBuff = new StringBuffer(
   * DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
   * Stack stack = new Stack(); // Iterate through all these annotations and
   * serialize them Iterator it = annotations.iterator(); while(it.hasNext()){
   * Annotation a = (Annotation) it.next(); it.remove(); // Test if a Ends at
   * offset if ( offset.equals(a.getEndNode().getOffset()) ){ // Test if a
   * Starts at offset if ( offset.equals(a.getStartNode().getOffset()) ){ //
   * Here, the annotation a Starts and Ends at the offset if ( null !=
   * a.getFeatures().get("isEmptyAndSpan") &&
   * "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){ // Assert:
   * annotation a with start == end and isEmptyAndSpan
   * tmpBuff.append(writeStartTag(a, includeFeatures)); stack.push(a); }else{ //
   * Assert annotation a with start == end and an empty tag
   * tmpBuff.append(writeEmptyTag(a)); // The annotation is removed from dumped
   * set aDumpAnnotList.remove(a); }// End if }else{ // Here the annotation a
   * Ends at the offset. // In this case empty the stack and write the end tag
   * if (!stack.isEmpty()){ while(!stack.isEmpty()){ Annotation a1 =
   * (Annotation)stack.pop(); tmpBuff.append(writeEndTag(a1)); }// End while }//
   * End if tmpBuff.append(writeEndTag(a)); }// End if }else{ // The annotation
   * a does NOT end at the offset. Let's see if it starts // at the offset if (
   * offset.equals(a.getStartNode().getOffset()) ){ // The annotation a starts
   * at the offset. // In this case empty the stack and write the end tag if
   * (!stack.isEmpty()){ while(!stack.isEmpty()){ Annotation a1 =
   * (Annotation)stack.pop(); tmpBuff.append(writeEndTag(a1)); }// End while }//
   * End if tmpBuff.append(writeStartTag(a, includeFeatures)); // The annotation
   * is removed from dumped set aDumpAnnotList.remove(a); }// End if (
   * offset.equals(a.getStartNode().getOffset()) ) }// End if (
   * offset.equals(a.getEndNode().getOffset()) ) }// End while(it.hasNext()){ //
   * In this case empty the stack and write the end tag if (!stack.isEmpty()){
   * while(!stack.isEmpty()){ Annotation a1 = (Annotation)stack.pop();
   * tmpBuff.append(writeEndTag(a1)); }// End while }// End if // Before
   * inserting tmpBuff into docContStrBuff we need to check // if there are
   * chars to be replaced and if there are, they would be // replaced. if
   * (!offsets2CharsMap.isEmpty()){ Long offsChar = (Long)
   * offsets2CharsMap.lastKey(); while( !offsets2CharsMap.isEmpty() &&
   * offsChar.intValue() >= offset.intValue()){ // Replace the char at offsChar
   * with its corresponding entity form // the entitiesMap.
   * docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
   * (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); //
   * Discard the offsChar after it was used. offsets2CharsMap.remove(offsChar); //
   * Investigate next offsChar if (!offsets2CharsMap.isEmpty()) offsChar =
   * (Long) offsets2CharsMap.lastKey(); }// End while }// End if // Insert
   * tmpBuff to the location where it belongs in docContStrBuff
   * docContStrBuff.insert(offset.intValue(),tmpBuff.toString()); }// End
   * while(!offsets.isEmpty()) // Need to replace the entities in the remaining
   * text, if there is any text // So, if there are any more items in
   * offsets2CharsMap they need to be // replaced while
   * (!offsets2CharsMap.isEmpty()){ Long offsChar = (Long)
   * offsets2CharsMap.lastKey(); // Replace the char with its entity
   * docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
   * (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); //
   * remove the offset from the map offsets2CharsMap.remove(offsChar); }// End
   * while return docContStrBuff.toString(); }// saveAnnotationSetAsXml()
 * Return true only if the document has features for original content and
 * repositioning information.
private boolean hasOriginalContentFeatures() {
    FeatureMap features = getFeatures();
    boolean result = false;
    result = (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null) && (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME) != null);
    return result;
Also used : FeatureMap(gate.FeatureMap)

Example 33 with FeatureMap

use of gate.FeatureMap in project gate-core by GateNLP.

the class CorpusImpl method populate.

 * Fills the provided corpus with documents extracted from the
 * provided trec file.
 * @param corpus the corpus to be populated.
 * @param singleConcatenatedFile the trec file.
 * @param documentRootElement text between this element (start and
 *          end) is considered for creating a new document.
 * @param encoding the encoding of the trec file.
 * @param numberOfDocumentsToExtract extracts the specified number of
 *          documents from the trecweb file; -1 to indicate all files.
 * @param mimeType the mime type which determines how the document is handled
 * @return total length of populated documents in the corpus in number
 *         of bytes
 * @throws IOException if an I/O error occurs while opening or reading the trec file
public static long populate(Corpus corpus, URL singleConcatenatedFile, String documentRootElement, String encoding, int numberOfDocumentsToExtract, String documentNamePrefix, String mimeType, boolean includeRootElement) throws IOException {
    StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
    // obtain the root element that user has provided
    // content between the start and end of root element is considered
    // for creating documents
    documentRootElement = documentRootElement.toLowerCase();
    // document name prefix could be an empty string
    documentNamePrefix = documentNamePrefix == null ? "" : documentNamePrefix.trim() + "_";
    // we start a new document when we find <documentRootElement> and
    // close it when we find </documentRootElement>
    BufferedReader br = null;
    try {
        if (encoding != null && encoding.trim().length() != 0) {
            br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(), encoding, 10485760);
        } else {
            br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(), 10485760);
        // reading line by line
        String line = br.readLine();
        // this is where we store document content
        StringBuilder documentString = new StringBuilder();
        // toggle switch to indicate search for start element
        boolean searchingForStartElement = true;
        // keeping count of number of documents extracted
        int count = 1;
        // length in bytes read so far (to return)
        long lengthInBytes = 0;
        // continue until reached the end of file
        while (line != null) {
            // lowercase the line in order to match documentRootElement in any case
            String lowerCasedLine = line.toLowerCase();
            // if searching for startElement?
            if (searchingForStartElement) {
                // the start tag may carry attributes
                int index = lowerCasedLine.indexOf("<" + documentRootElement + " ");
                // or it may have no attributes
                if (index == -1) {
                    index = lowerCasedLine.indexOf("<" + documentRootElement + ">");
                // skip the current line and start reading from the next line
                if (index != -1) {
                    // if found, that's the first line
                    line = line.substring(index);
                    searchingForStartElement = false;
                } else {
                    line = br.readLine();
            } else {
                // now searching for last element
                int index = lowerCasedLine.indexOf("</" + documentRootElement + ">");
                // if not found.. this is the content of a new document
                if (index == -1) {
                    documentString.append(line + "\n");
                    line = br.readLine();
                } else {
                    // found.. then end the document
                    documentString.append(line.substring(0, index + documentRootElement.length() + 3));
                    // getting ready for the next document
                    searchingForStartElement = true;
                    // now create a new document from the collected content
                    if (sListener != null)
                        sListener.statusChanged("Creating Document Number :" + count);
                    String docName = documentNamePrefix + count + "_" + Gate.genSym();
                    String docContent = documentString.toString();
                    if (!includeRootElement)
                        docContent = docContent.substring(docContent.indexOf(">") + 1, docContent.lastIndexOf("<"));
                    FeatureMap params = Factory.newFeatureMap();
                    if (mimeType != null)
                        params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
                    params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, docContent);
                    if (encoding != null && encoding.trim().length() > 0)
                        params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
                    // calculate the length
                    lengthInBytes += docContent.getBytes().length;
                    try {
                        Document doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
                        if (corpus.getLRPersistenceId() != null) {
                            // persistent corpus -> unload the document
                        // already extracted requested num of documents?
                        if ((count - 1) == numberOfDocumentsToExtract)
                    } catch (Throwable t) {
                        String nl = Strings.getNl();
                        Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + "  Document name was: " + docName + nl + "  Exception was: " + t + nl + nl);
                    documentString = new StringBuilder();
                    if (sListener != null)
                        sListener.statusChanged(docName + " created!");
                    line = line.substring(index + documentRootElement.length() + 3);
                    if (line.trim().equals(""))
                        line = br.readLine();
        return lengthInBytes;
    } finally {
        if (br != null)
Also used : FeatureMap(gate.FeatureMap) BomStrippingInputStreamReader(gate.util.BomStrippingInputStreamReader) BufferedReader(java.io.BufferedReader) StatusListener(gate.event.StatusListener) Document(gate.Document)

Example 34 with FeatureMap

use of gate.FeatureMap in project gate-core by GateNLP.

the class CorpusImpl method populate.

 * Fills the provided corpus with documents created on the fly from
 * selected files in a directory. Uses a {@link FileFilter} to select
 * which files will be used and which will be ignored. A simple file
 * filter based on extensions is provided in the Gate distribution (
 * {@link gate.util.ExtensionFileFilter}).
 * @param corpus the corpus to be populated
 * @param directory the directory from which the files will be picked.
 *          This parameter is an URL for uniformity. It needs to be a
 *          URL of type file otherwise an InvalidArgumentException
 *          will be thrown.
 * @param filter the file filter used to select files from the target
 *          directory. If the filter is <tt>null</tt> all the files
 *          will be accepted.
 * @param encoding the encoding to be used for reading the documents
 * @param recurseDirectories should the directory be parsed
 *          recursively?. If <tt>true</tt> all the files from the
 *          provided directory and all its children directories (on as
 *          many levels as necessary) will be picked if accepted by
 *          the filter otherwise the children directories will be
 *          ignored.
 * @throws IOException if a file doesn't exist
public static void populate(Corpus corpus, URL directory, FileFilter filter, String encoding, String mimeType, boolean recurseDirectories) throws IOException {
    // check input
    if (!directory.getProtocol().equalsIgnoreCase("file"))
        throw new IllegalArgumentException("The URL provided is not of type \"file:\"!");
    File dir = Files.fileFromURL(directory);
    if (!dir.exists())
        throw new FileNotFoundException(dir.toString());
    if (!dir.isDirectory())
        throw new IllegalArgumentException(dir.getAbsolutePath() + " is not a directory!");
    File[] files;
    // populate the corpus
    if (recurseDirectories) {
        files = Files.listFilesRecursively(dir, filter);
    } else {
        files = dir.listFiles(filter);
    if (files == null) {
    // sort the files alphabetically regardless of their paths
    Arrays.sort(files, new Comparator<File>() {

        public int compare(File f1, File f2) {
            return f1.getName().compareTo(f2.getName());
    // create the GATE documents
    for (File file : files) {
        if (file.isDirectory()) {
        StatusListener sListener = (StatusListener) Gate.getListeners().get("gate.event.StatusListener");
        if (sListener != null)
            sListener.statusChanged("Reading: " + file.getName());
        String docName = file.getName() + "_" + Gate.genSym();
        FeatureMap params = Factory.newFeatureMap();
        params.put(Document.DOCUMENT_URL_PARAMETER_NAME, file.toURI().toURL());
        if (encoding != null)
            params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
        if (mimeType != null)
            params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
        try {
            Document doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
            if (corpus.getLRPersistenceId() != null) {
                // persistent corpus -> unload the document
        } catch (Throwable t) {
            String nl = Strings.getNl();
            Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + "  Document name was: " + docName + nl + "  Exception was: " + t + nl + nl);
        if (sListener != null)
            sListener.statusChanged(file.getName() + " read");
Also used : FeatureMap(gate.FeatureMap) FileNotFoundException(java.io.FileNotFoundException) StatusListener(gate.event.StatusListener) Document(gate.Document) File(java.io.File)

Example 35 with FeatureMap

use of gate.FeatureMap in project gate-core by GateNLP.

the class NekoHtmlDocumentHandler method startElement.

 * Called when the parser encounters the start of an HTML element.
 * Empty elements also trigger this method, followed immediately by an
 * {@link #endElement}.
public void startElement(QName element, XMLAttributes attributes, Augmentations augs) throws XNIException {
    // deal with any outstanding character content
        Out.println("startElement: " + element.localpart);
    // rate
    if (0 == (++elements % ELEMENTS_RATE))
        fireStatusChangedEvent("Processed elements : " + elements);
    // Start of ignorable tag
    if (ignorableTags.contains(element.localpart)) {
        if (DEBUG_ELEMENTS) {
            Out.println("  ignorable tag: levels = " + ignorableTagLevels);
    // if
    // Construct a feature map from the attributes list
    FeatureMap fm = Factory.newFeatureMap();
    // Take all the attributes and put them into the feature map
    for (int i = 0; i < attributes.getLength(); i++) {
        if (DEBUG_ELEMENTS) {
            Out.println("  attribute: " + attributes.getLocalName(i) + " = " + attributes.getValue(i));
        fm.put(attributes.getLocalName(i), attributes.getValue(i));
    // Just analyze the tag and add some \n chars and spaces to the
    // tmpDocContent. The reason behind this is that we need to have a
    // readable form
    // for the final document.
    // create the start index of the annotation
    Long startIndex = new Long(tmpDocContent.length());
    // initially the start index is equal to the end index
    CustomObject obj = new CustomObject(element.localpart, fm, startIndex, startIndex);
    // put it into the stack
Also used : FeatureMap(gate.FeatureMap)


FeatureMap (gate.FeatureMap)55 Document (gate.Document)15 URL ( ResourceInstantiationException (gate.creole.ResourceInstantiationException)11 File ( Resource (gate.Resource)8 GateRuntimeException (gate.util.GateRuntimeException)7 ArrayList (java.util.ArrayList)7 List (java.util.List)7 PersistenceException (gate.persist.PersistenceException)6 Annotation (gate.Annotation)5 AnnotationSet (gate.AnnotationSet)5 DataStore (gate.DataStore)5 LanguageResource (gate.LanguageResource)5 TestDocument (gate.corpora.TestDocument)4 ResourceData (gate.creole.ResourceData)4 SerialDataStore (gate.persist.SerialDataStore)4 InvalidOffsetException (gate.util.InvalidOffsetException)4 Corpus (gate.Corpus)3 ProcessingResource (gate.ProcessingResource)3