use of gate.FeatureMap in project gate-core by GateNLP.
the class AnnotationImpl method addAnnotationListener.
/**
 * Adds an annotation listener to this annotation.
 * <p>
 * Uses a copy-on-write scheme: the listener Vector is cloned, modified,
 * and only then published back to {@code annotationListeners}, so event
 * firing that iterates the old Vector never races with this update.
 *
 * @param l the listener to register; ignored if already registered
 */
@Override
public synchronized void addAnnotationListener(AnnotationListener l) {
@SuppressWarnings("unchecked") Vector<AnnotationListener> v = annotationListeners == null ? new Vector<AnnotationListener>(2) : (Vector<AnnotationListener>) annotationListeners.clone();
// On the very first listener, also start listening to this annotation's
// own FeatureMap (lazily creating the shared handler) so that feature
// change events will also be propagated to annotation listeners.
if (v.isEmpty()) {
FeatureMap features = getFeatures();
if (eventHandler == null)
eventHandler = new EventsHandler();
features.addFeatureMapListener(eventHandler);
}
// Register each listener at most once, then publish the updated list.
if (!v.contains(l)) {
v.addElement(l);
annotationListeners = v;
}
}
use of gate.FeatureMap in project gate-core by GateNLP.
the class DocumentImpl method hasOriginalContentFeatures.
// saveAnnotationSetAsXml()
/*
* Old method created by Cristian. Create content backward.
*
* private String saveAnnotationSetAsXml(List aDumpAnnotList, boolean
* includeFeatures){ String content = null; if (this.getContent()== null)
* content = new String(""); else content = this.getContent().toString();
* StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
* if (aDumpAnnotList == null) return docContStrBuff.toString();
*
* TreeMap offsets2CharsMap = new TreeMap(); HashMap annotsForOffset = new
* HashMap(100); if (this.getContent().size().longValue() != 0){ // Fill the
* offsets2CharsMap with all the indices where // special chars appear
* buildEntityMapFromString(content,offsets2CharsMap); }//End if // The saving
* algorithm is as follows: /////////////////////////////////////////// //
* Construct a set of annot with all IDs in asc order. // All annotations that
* end at that offset swap their place in descending // order. For each node
* write all the tags from left to right. // Construct the node set TreeSet
* offsets = new TreeSet(); Iterator iter = aDumpAnnotList.iterator(); while
* (iter.hasNext()){ Annotation annot = (Annotation) iter.next();
* offsets.add(annot.getStartNode().getOffset());
* offsets.add(annot.getEndNode().getOffset()); if
* (annotsForOffset.containsKey(annot.getStartNode().getOffset())) { ((List)
* annotsForOffset.get(annot.getStartNode().getOffset())).add(annot); } else {
* List newList = new ArrayList(10); newList.add(annot);
* annotsForOffset.put(annot.getStartNode().getOffset(), newList); } if
* (annotsForOffset.containsKey(annot.getEndNode().getOffset())) { ((List)
* annotsForOffset.get(annot.getEndNode().getOffset())).add(annot); } else {
* List newList = new ArrayList(10); newList.add(annot);
* annotsForOffset.put(annot.getEndNode().getOffset(), newList); } }// End
* while // offsets is sorted in ascending order. // Iterate this set in
* descending order and remove an offset at each // iteration while
* (!offsets.isEmpty()){ Long offset = (Long)offsets.last(); // Remove the
* offset from the set offsets.remove(offset); // Now, use it. // Returns a
* list with annotations that needs to be serialized in that // offset. //
* List annotations = getAnnotationsForOffset(aDumpAnnotList,offset); List
* annotations = (List) annotsForOffset.get(offset); annotations =
* getAnnotationsForOffset(annotations,offset); // Attention: the annotation
* are serialized from left to right // StringBuffer tmpBuff = new
* StringBuffer(""); StringBuffer tmpBuff = new StringBuffer(
* DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
* Stack stack = new Stack(); // Iterate through all these annotations and
* serialize them Iterator it = annotations.iterator(); while(it.hasNext()){
* Annotation a = (Annotation) it.next(); it.remove(); // Test if a Ends at
* offset if ( offset.equals(a.getEndNode().getOffset()) ){ // Test if a
* Starts at offset if ( offset.equals(a.getStartNode().getOffset()) ){ //
* Here, the annotation a Starts and Ends at the offset if ( null !=
* a.getFeatures().get("isEmptyAndSpan") &&
* "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){ // Assert:
* annotation a with start == end and isEmptyAndSpan
* tmpBuff.append(writeStartTag(a, includeFeatures)); stack.push(a); }else{ //
* Assert annotation a with start == end and an empty tag
* tmpBuff.append(writeEmptyTag(a)); // The annotation is removed from dumped
* set aDumpAnnotList.remove(a); }// End if }else{ // Here the annotation a
* Ends at the offset. // In this case empty the stack and write the end tag
* if (!stack.isEmpty()){ while(!stack.isEmpty()){ Annotation a1 =
* (Annotation)stack.pop(); tmpBuff.append(writeEndTag(a1)); }// End while }//
* End if tmpBuff.append(writeEndTag(a)); }// End if }else{ // The annotation
* a does NOT end at the offset. Let's see if it starts // at the offset if (
* offset.equals(a.getStartNode().getOffset()) ){ // The annotation a starts
* at the offset. // In this case empty the stack and write the end tag if
* (!stack.isEmpty()){ while(!stack.isEmpty()){ Annotation a1 =
* (Annotation)stack.pop(); tmpBuff.append(writeEndTag(a1)); }// End while }//
* End if tmpBuff.append(writeStartTag(a, includeFeatures)); // The annotation
* is removed from dumped set aDumpAnnotList.remove(a); }// End if (
* offset.equals(a.getStartNode().getOffset()) ) }// End if (
* offset.equals(a.getEndNode().getOffset()) ) }// End while(it.hasNext()){ //
* In this case empty the stack and write the end tag if (!stack.isEmpty()){
* while(!stack.isEmpty()){ Annotation a1 = (Annotation)stack.pop();
* tmpBuff.append(writeEndTag(a1)); }// End while }// End if // Before
* inserting tmpBuff into docContStrBuff we need to check // if there are
* chars to be replaced and if there are, they would be // replaced. if
* (!offsets2CharsMap.isEmpty()){ Long offsChar = (Long)
* offsets2CharsMap.lastKey(); while( !offsets2CharsMap.isEmpty() &&
* offsChar.intValue() >= offset.intValue()){ // Replace the char at offsChar
* with its corresponding entity form // the entitiesMap.
* docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
* (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); //
* Discard the offsChar after it was used. offsets2CharsMap.remove(offsChar); //
* Investigate next offsChar if (!offsets2CharsMap.isEmpty()) offsChar =
* (Long) offsets2CharsMap.lastKey(); }// End while }// End if // Insert
* tmpBuff to the location where it belongs in docContStrBuff
* docContStrBuff.insert(offset.intValue(),tmpBuff.toString()); }// End
* while(!offsets.isEmpty()) // Need to replace the entities in the remaining
* text, if there is any text // So, if there are any more items in
* offsets2CharsMap they need to be // replaced while
* (!offsets2CharsMap.isEmpty()){ Long offsChar = (Long)
* offsets2CharsMap.lastKey(); // Replace the char with its entity
* docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
* (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); //
* remove the offset from the map offsets2CharsMap.remove(offsChar); }// End
* while return docContStrBuff.toString(); }// saveAnnotationSetAsXml()
*/
/**
 * Checks whether this document carries both the original-content feature
 * and the repositioning-information feature.
 *
 * @return <tt>true</tt> only when both features are present (non-null)
 */
private boolean hasOriginalContentFeatures() {
  FeatureMap docFeatures = getFeatures();
  return docFeatures.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null
      && docFeatures.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME) != null;
}
use of gate.FeatureMap in project gate-core by GateNLP.
the class CorpusImpl method populate.
/**
 * Fills the provided corpus with documents extracted from the
 * provided trec file.
 *
 * @param corpus the corpus to be populated.
 * @param singleConcatenatedFile the trec file.
 * @param documentRootElement text between this element (start and
 *          end) is considered for creating a new document. Matched
 *          case-insensitively.
 * @param encoding the encoding of the trec file.
 * @param numberOfDocumentsToExtract extracts the specified number of
 *          documents from the trecweb file; -1 to indicate all files.
 * @param documentNamePrefix prefix for the names of the created documents;
 *          may be <tt>null</tt>, which is treated as the empty string.
 * @param mimeType the mime type which determines how the document is handled
 * @param includeRootElement whether the root element tags themselves are
 *          kept in the content of the created documents
 * @return total length of populated documents in the corpus in number
 *         of bytes
 * @throws java.io.IOException if the file cannot be read, or if the given
 *           encoding is not supported
 */
public static long populate(Corpus corpus, URL singleConcatenatedFile, String documentRootElement, String encoding, int numberOfDocumentsToExtract, String documentNamePrefix, String mimeType, boolean includeRootElement) throws IOException {
  StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
  // obtain the root element that user has provided; content between the
  // start and end of root element is considered for creating documents
  documentRootElement = documentRootElement.toLowerCase();
  // document name prefix could be an empty string
  documentNamePrefix = documentNamePrefix == null ? "" : documentNamePrefix.trim() + "_";
  // did the caller supply an explicit character encoding?
  boolean explicitEncoding = encoding != null && encoding.trim().length() != 0;
  // we start a new document when we find <documentRootElement> and
  // close it when we find </documentRootElement>; try-with-resources
  // guarantees the reader is closed on every exit path
  try (BufferedReader br = explicitEncoding
      ? new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(), encoding, 10485760)
      : new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(), 10485760)) {
    // reading line by line
    String line = br.readLine();
    // this is where we store document content
    StringBuilder documentString = new StringBuilder();
    // toggle switch to indicate search for start element
    boolean searchingForStartElement = true;
    // keeping count of number of documents extracted
    int count = 1;
    // length in bytes read so far (to return)
    long lengthInBytes = 0;
    // continue until reached the end of file
    while (line != null) {
      // lowercase the line in order to match documentRootElement in any case
      String lowerCasedLine = line.toLowerCase();
      if (searchingForStartElement) {
        // the start tag may carry attributes...
        int index = lowerCasedLine.indexOf("<" + documentRootElement + " ");
        // ...or have none at all
        if (index == -1) {
          index = lowerCasedLine.indexOf("<" + documentRootElement + ">");
        }
        if (index != -1) {
          // found: the document starts at the tag on this very line
          line = line.substring(index);
          searchingForStartElement = false;
        } else {
          // skip the current line and start reading from the next line
          line = br.readLine();
        }
      } else {
        // now searching for the closing element
        int index = lowerCasedLine.indexOf("</" + documentRootElement + ">");
        if (index == -1) {
          // not found: this line is part of the current document's content
          documentString.append(line + "\n");
          line = br.readLine();
        } else {
          // found: end the document; "</" + name + ">" is name.length() + 3
          // characters long
          documentString.append(line.substring(0, index + documentRootElement.length() + 3));
          // getting ready for the next document
          searchingForStartElement = true;
          // here lets create a new document create the doc
          if (sListener != null)
            sListener.statusChanged("Creating Document Number :" + count);
          String docName = documentNamePrefix + count + "_" + Gate.genSym();
          String docContent = documentString.toString();
          if (!includeRootElement)
            docContent = docContent.substring(docContent.indexOf(">") + 1, docContent.lastIndexOf("<"));
          FeatureMap params = Factory.newFeatureMap();
          if (mimeType != null)
            params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
          params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, docContent);
          if (explicitEncoding)
            params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
          // calculate the length using the declared encoding when one was
          // given, so the reported byte count matches the document's actual
          // bytes instead of the platform default charset's encoding of it
          lengthInBytes += explicitEncoding ? docContent.getBytes(encoding).length : docContent.getBytes().length;
          try {
            Document doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
            count++;
            corpus.add(doc);
            if (corpus.getLRPersistenceId() != null) {
              // persistent corpus -> unload the document
              corpus.unloadDocument(doc);
              Factory.deleteResource(doc);
            }
            // already extracted requested num of documents?
            if ((count - 1) == numberOfDocumentsToExtract)
              break;
          } catch (Throwable t) {
            String nl = Strings.getNl();
            Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + " Document name was: " + docName + nl + " Exception was: " + t + nl + nl);
            t.printStackTrace();
          }
          documentString = new StringBuilder();
          if (sListener != null)
            sListener.statusChanged(docName + " created!");
          // anything after the closing tag on this line may start the next
          // document, so keep scanning the remainder
          line = line.substring(index + documentRootElement.length() + 3);
          if (line.trim().equals(""))
            line = br.readLine();
        }
      }
    }
    return lengthInBytes;
  }
}
use of gate.FeatureMap in project gate-core by GateNLP.
the class CorpusImpl method populate.
/**
 * Fills the provided corpus with documents created on the fly from
 * selected files in a directory. Uses a {@link FileFilter} to select
 * which files will be used and which will be ignored. A simple file
 * filter based on extensions is provided in the Gate distribution (
 * {@link gate.util.ExtensionFileFilter}).
 *
 * @param corpus the corpus to be populated
 * @param directory the directory from which the files will be picked.
 *          This parameter is an URL for uniformity. It needs to be a
 *          URL of type file otherwise an InvalidArgumentException
 *          will be thrown.
 * @param filter the file filter used to select files from the target
 *          directory. If the filter is <tt>null</tt> all the files
 *          will be accepted.
 * @param encoding the encoding to be used for reading the documents
 * @param mimeType the mime type used when creating the documents; may be
 *          <tt>null</tt>
 * @param recurseDirectories should the directory be parsed
 *          recursively?. If <tt>true</tt> all the files from the
 *          provided directory and all its children directories (on as
 *          many levels as necessary) will be picked if accepted by
 *          the filter otherwise the children directories will be
 *          ignored.
 * @throws java.io.IOException if a file doesn't exist
 */
public static void populate(Corpus corpus, URL directory, FileFilter filter, String encoding, String mimeType, boolean recurseDirectories) throws IOException {
  // check input: only file: URLs can be mapped to a directory
  if (!directory.getProtocol().equalsIgnoreCase("file"))
    throw new IllegalArgumentException("The URL provided is not of type \"file:\"!");
  File dir = Files.fileFromURL(directory);
  if (!dir.exists())
    throw new FileNotFoundException(dir.toString());
  if (!dir.isDirectory())
    throw new IllegalArgumentException(dir.getAbsolutePath() + " is not a directory!");
  // collect the candidate files, optionally descending into subdirectories
  File[] files;
  if (recurseDirectories) {
    files = Files.listFilesRecursively(dir, filter);
  } else {
    files = dir.listFiles(filter);
  }
  if (files == null) {
    return;
  }
  // sort the files alphabetically regardless of their paths
  Arrays.sort(files, Comparator.comparing(File::getName));
  // the status listener is loop-invariant, so look it up once rather than
  // once per file
  StatusListener sListener = (StatusListener) Gate.getListeners().get("gate.event.StatusListener");
  // create the GATE documents
  for (File file : files) {
    if (file.isDirectory()) {
      continue;
    }
    if (sListener != null)
      sListener.statusChanged("Reading: " + file.getName());
    String docName = file.getName() + "_" + Gate.genSym();
    FeatureMap params = Factory.newFeatureMap();
    params.put(Document.DOCUMENT_URL_PARAMETER_NAME, file.toURI().toURL());
    if (encoding != null)
      params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
    if (mimeType != null)
      params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
    try {
      Document doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
      corpus.add(doc);
      if (corpus.getLRPersistenceId() != null) {
        // persistent corpus -> unload the document
        corpus.unloadDocument(doc);
        Factory.deleteResource(doc);
      }
    } catch (Throwable t) {
      // best-effort population: log and continue with the remaining files
      String nl = Strings.getNl();
      Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + " Document name was: " + docName + nl + " Exception was: " + t + nl + nl);
      t.printStackTrace();
    }
    if (sListener != null)
      sListener.statusChanged(file.getName() + " read");
  }
}
use of gate.FeatureMap in project gate-core by GateNLP.
the class NekoHtmlDocumentHandler method startElement.
/**
 * Called when the parser encounters the start of an HTML element.
 * Empty elements also trigger this method, followed immediately by an
 * {@link #endElement}.
 *
 * @param element the element that has started
 * @param attributes the element's attributes, copied into the annotation's
 *          feature map
 * @param augs augmentations supplied by the parser (unused here)
 * @throws XNIException if processing fails
 */
@Override
public void startElement(QName element, XMLAttributes attributes, Augmentations augs) throws XNIException {
  // deal with any outstanding character content
  charactersAction();
  if (DEBUG_ELEMENTS) {
    Out.println("startElement: " + element.localpart);
  }
  // report progress every ELEMENTS_RATE elements
  if (0 == (++elements % ELEMENTS_RATE))
    fireStatusChangedEvent("Processed elements : " + elements);
  // Start of ignorable tag: track the nesting depth so we know when we
  // have left the outermost ignorable region again
  if (ignorableTags.contains(element.localpart)) {
    ignorableTagLevels++;
    if (DEBUG_ELEMENTS) {
      Out.println(" ignorable tag: levels = " + ignorableTagLevels);
    }
  }
  // Construct a feature map from the attributes list: take all the
  // attributes and put them into the feature map
  FeatureMap fm = Factory.newFeatureMap();
  for (int i = 0; i < attributes.getLength(); i++) {
    if (DEBUG_ELEMENTS) {
      Out.println(" attribute: " + attributes.getLocalName(i) + " = " + attributes.getValue(i));
    }
    fm.put(attributes.getLocalName(i), attributes.getValue(i));
  }
  // Analyse the tag and add some newline chars and spaces to
  // tmpDocContent. The reason behind this is that we need a readable form
  // for the final document.
  customizeAppearanceOfDocumentWithStartTag(element.localpart);
  // create the start index of the annotation; Long.valueOf avoids the
  // deprecated Long(long) constructor and can reuse cached instances
  Long startIndex = Long.valueOf(tmpDocContent.length());
  // initially the start index is equal to the end index (empty span until
  // the matching end tag is seen)
  CustomObject obj = new CustomObject(element.localpart, fm, startIndex, startIndex);
  // put it onto the stack of currently-open elements
  stack.push(obj);
}
Aggregations