use of gate.util.InvalidOffsetException in project gate-core by GateNLP.
the class LuceneDocument method createDocuments.
/**
 * Converts a GATE document into the Lucene documents that represent it in
 * the index. One Lucene document is produced for every token stream of every
 * annotation set selected for indexing, and each token stream is also written
 * to disk so that it can be retrieved at search time.
 *
 * @param corpusPersistenceID stored as the corpus ID field; the field is
 *          omitted when this is null
 * @param gateDoc the GATE document to index
 * @param documentID stored as the document ID field; also used, suffixed
 *          with "-" and a running counter, to name each serialized token
 *          stream
 * @param annotSetsToInclude names of the annotation sets to index; when
 *          non-empty only these sets are indexed
 * @param annotSetsToExclude names of annotation sets to skip; only consulted
 *          when annotSetsToInclude is empty
 * @param featuresToInclude passed through to getTokens
 * @param featuresToExclude passed through to getTokens
 * @param indexLocation passed through to writeOnDisk
 * @param baseTokenAnnotationType either "type" or "setName.type"; when null
 *          or empty, tokens are created automatically (if allowed)
 * @param createTokensAutomatically whether tokens may be created when no
 *          base tokens are found; null is treated as false
 * @param indexUnitAnnotationType either "type" or "setName.type"; may be
 *          null or empty
 * @return the Lucene documents created, or null when no tokens are available
 *         (and cannot be created), when getTokens returns null, or when a
 *         token stream cannot be written to disk
 */
public List<Document> createDocuments(String corpusPersistenceID, gate.Document gateDoc, String documentID, List<String> annotSetsToInclude, List<String> annotSetsToExclude, List<String> featuresToInclude, List<String> featuresToExclude, String indexLocation, String baseTokenAnnotationType, Boolean createTokensAutomatically, String indexUnitAnnotationType) {
  if (baseTokenAnnotationType != null)
    baseTokenAnnotationType = baseTokenAnnotationType.trim();
  // trimmed up front, exactly like the base token type, so that padded input
  // such as " set.Sentence " resolves to the intended set and type
  if (indexUnitAnnotationType != null)
    indexUnitAnnotationType = indexUnitAnnotationType.trim();
  List<Document> toReturnBack = new ArrayList<Document>();
  List<String> annotSetsToIndex = new ArrayList<String>();
  if (annotSetsToInclude.size() > 0) {
    // an explicit include list wins over everything else
    annotSetsToIndex = annotSetsToInclude;
  } else {
    // no include list: index every named set plus the default set, minus
    // whatever the exclude list names (an empty exclude list excludes
    // nothing, so this also covers the "index everything" case)
    Set<String> namedAnnotSets = new HashSet<String>();
    if (gateDoc.getNamedAnnotationSets() != null && gateDoc.getNamedAnnotationSets().keySet() != null) {
      namedAnnotSets = gateDoc.getNamedAnnotationSets().keySet();
    }
    for (String setName : namedAnnotSets) {
      if (annotSetsToExclude.contains(setName))
        continue;
      annotSetsToIndex.add(setName);
    }
    if (!annotSetsToExclude.contains(Constants.DEFAULT_ANNOTATION_SET_NAME)) {
      annotSetsToIndex.add(Constants.DEFAULT_ANNOTATION_SET_NAME);
    }
  }
  // --- locate the annotation set that holds the base tokens ---
  AnnotationSet baseTokenAnnotationSet = null;
  boolean searchBaseTokensInAllAnnotationSets = false;
  boolean searchIndexUnitInAllAnnotationSets = false;
  // tells whether we have to create the tokens ourselves
  boolean createManualTokens = false;
  // check whether the user's input has the form setName.baseTokenAnnotationType
  int index = -1;
  if (baseTokenAnnotationType != null && baseTokenAnnotationType.length() > 0)
    index = baseTokenAnnotationType.lastIndexOf('.');
  if (index >= 0) {
    String setName = baseTokenAnnotationType.substring(0, index);
    baseTokenAnnotationType = baseTokenAnnotationType.substring(index + 1);
    if (setName.equals(Constants.DEFAULT_ANNOTATION_SET_NAME))
      baseTokenAnnotationSet = gateDoc.getAnnotations().get(baseTokenAnnotationType);
    else
      baseTokenAnnotationSet = gateDoc.getAnnotations(setName).get(baseTokenAnnotationType);
    if (baseTokenAnnotationSet == null || baseTokenAnnotationSet.size() == 0) {
      System.err.println("Base Tokens " + baseTokenAnnotationType + " counldn't be found under the specified annotation set " + setName + "\n searching them in other annotation sets");
      searchBaseTokensInAllAnnotationSets = true;
    }
  } else {
    // either the type is null/empty or no set name was given, so every
    // annotation set to index has to be searched
    searchBaseTokensInAllAnnotationSets = true;
  }
  if (baseTokenAnnotationType != null && baseTokenAnnotationType.length() > 0 && searchBaseTokensInAllAnnotationSets) {
    AnnotationSet found = firstNonEmptyAnnotationsOfType(gateDoc, annotSetsToIndex, baseTokenAnnotationType);
    if (found != null) {
      baseTokenAnnotationSet = found;
    } else {
      // no set to index holds base tokens; they have to be created
      createManualTokens = true;
    }
  }
  // without a base token type we always have to create tokens ourselves
  if (baseTokenAnnotationType == null || baseTokenAnnotationType.length() == 0)
    createManualTokens = true;
  if (createManualTokens) {
    // a null flag is treated as "not allowed" instead of failing on unboxing
    if (createTokensAutomatically == null || !createTokensAutomatically.booleanValue()) {
      System.out.println("Tokens couldn't be found in the document - Ignoring the document " + gateDoc.getName());
      return null;
    }
    baseTokenAnnotationType = Constants.ANNIC_TOKEN;
    if (baseTokenAnnotationSet == null) {
      baseTokenAnnotationSet = new AnnotationSetImpl(gateDoc);
    }
    if (!createTokens(gateDoc, baseTokenAnnotationSet)) {
      System.out.println("Tokens couldn't be created manually - Ignoring the document " + gateDoc.getName());
      return null;
    }
  }
  // --- locate the annotation set that holds the index unit annotations ---
  AnnotationSet indexUnitAnnotationSet = null;
  // check whether the user's input has the form setName.indexUnitAnnotationType
  index = -1;
  if (indexUnitAnnotationType != null && indexUnitAnnotationType.length() > 0)
    index = indexUnitAnnotationType.lastIndexOf('.');
  if (index >= 0) {
    String setName = indexUnitAnnotationType.substring(0, index);
    indexUnitAnnotationType = indexUnitAnnotationType.substring(index + 1);
    if (setName.equals(Constants.DEFAULT_ANNOTATION_SET_NAME))
      indexUnitAnnotationSet = gateDoc.getAnnotations().get(indexUnitAnnotationType);
    else
      indexUnitAnnotationSet = gateDoc.getAnnotations(setName).get(indexUnitAnnotationType);
    if (indexUnitAnnotationSet == null || indexUnitAnnotationSet.size() == 0) {
      System.err.println("Index Unit " + indexUnitAnnotationType + " counldn't be found under the specified annotation set " + setName + "\n searching them in other annotation sets");
      searchIndexUnitInAllAnnotationSets = true;
    }
  } else {
    // either the type is null/empty or no set name was given
    searchIndexUnitInAllAnnotationSets = true;
  }
  if (indexUnitAnnotationType != null && indexUnitAnnotationType.length() > 0 && searchIndexUnitInAllAnnotationSets) {
    AnnotationSet found = firstNonEmptyAnnotationsOfType(gateDoc, annotSetsToIndex, indexUnitAnnotationType);
    if (found != null) {
      indexUnitAnnotationSet = found;
    }
  }
  // without an index unit set the index unit type is meaningless
  if (indexUnitAnnotationSet == null) {
    indexUnitAnnotationType = null;
  }
  // running counter over all token streams of all sets; gives every
  // serialized token stream of this document a unique suffix
  int j = 0;
  for (String annotSet : annotSetsToIndex) {
    AnnotationSet aSetToIndex = annotSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME) ? gateDoc.getAnnotations() : gateDoc.getAnnotations(annotSet);
    // passed to getTokens, which is expected to fill it in
    Set<String> indexedFeatures = new HashSet<String>();
    List<Token>[] tokenStreams = getTokens(gateDoc, aSetToIndex, featuresToInclude, featuresToExclude, baseTokenAnnotationType, baseTokenAnnotationSet, indexUnitAnnotationType, indexUnitAnnotationSet, indexedFeatures);
    if (tokenStreams == null)
      return null;
    // ';'-separated list of the indexed features, built once per set
    StringBuilder indexedFeaturesString = new StringBuilder();
    for (String aFeat : indexedFeatures) {
      indexedFeaturesString.append(aFeat).append(';');
    }
    // drop the trailing ';' - but only if there is one: with no indexed
    // features the buffer is empty and must stay empty rather than throw
    if (indexedFeaturesString.length() > 0) {
      indexedFeaturesString.setLength(indexedFeaturesString.length() - 1);
    }
    String indexedFeaturesValue = indexedFeaturesString.toString();
    for (int i = 0; i < tokenStreams.length; i++, j++) {
      Document doc = new Document();
      LuceneReader reader = new LuceneReader(gateDoc, tokenStreams[i]);
      doc.add(Field.Keyword(Constants.DOCUMENT_ID, documentID));
      doc.add(Field.Keyword(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE, documentID + "-" + j));
      doc.add(Field.Keyword(Constants.INDEXED_FEATURES, indexedFeaturesValue));
      if (corpusPersistenceID != null)
        doc.add(Field.Keyword(Constants.CORPUS_ID, corpusPersistenceID));
      doc.add(Field.Keyword(Constants.ANNOTATION_SET_ID, annotSet));
      doc.add(Field.Text("contents", reader));
      // the token stream is stored on the file system as well
      try {
        writeOnDisk(tokenStreams[i], documentID, documentID + "-" + j, indexLocation);
      } catch (Exception e) {
        Err.println("\nIgnoring the document : " + gateDoc.getName() + " since its token stream cannot be written on the disk");
        Err.println("Reason: " + e.getMessage());
        return null;
      }
      toReturnBack.add(doc);
    }
  }
  return toReturnBack;
}

/**
 * Looks through the given annotation sets, in order, and returns the
 * annotations of the requested type from the first set that has any.
 *
 * @return the first non-empty result, or null if no set has annotations of
 *         that type
 */
private AnnotationSet firstNonEmptyAnnotationsOfType(gate.Document gateDoc, List<String> setNames, String annotationType) {
  for (String setName : setNames) {
    AnnotationSet source = setName.equals(Constants.DEFAULT_ANNOTATION_SET_NAME) ? gateDoc.getAnnotations() : gateDoc.getAnnotations(setName);
    AnnotationSet candidate = source.get(annotationType);
    // null-checked for consistency with the explicit-set lookups, which
    // also treat a null result as "nothing found"
    if (candidate != null && candidate.size() > 0) {
      return candidate;
    }
  }
  return null;
}
use of gate.util.InvalidOffsetException in project gate-core by GateNLP.
the class DocumentExportMenu method getSelectedFile.
/**
 * Determines the file (or directory) to export to. A default name is first
 * derived from the target document's source URL (or, failing that, its
 * title markup), then the user is asked to confirm or change it, either
 * through the plain file chooser or through the parameter dialog.
 *
 * @param params the exporter's parameters; when null or empty the plain
 *          file chooser is shown instead of the parameter dialog
 * @param de the exporter, used for the default extension, file filter and
 *          file type
 * @param options receives the parameter values chosen in the dialog
 * @return the chosen file, or null if the user cancelled
 */
private File getSelectedFile(List<List<Parameter>> params, DocumentExporter de, FeatureMap options) {
  File selectedFile = null;
  Document document = (handle.getTarget() instanceof Document ? (Document) handle.getTarget() : null);
  // are we looking for a file or a directory?
  boolean singleFile = (document != null) || (de instanceof CorpusExporter);
  if (document != null && document.getSourceUrl() != null) {
    String fileName = null;
    try {
      fileName = document.getSourceUrl().toURI().getPath();
    } catch (URISyntaxException e) {
      fileName = document.getSourceUrl().getPath();
    }
    // URI.getPath() is null for opaque URIs (e.g. jar:file:...), so fall
    // back to the URL's own path rather than failing on trim()
    if (fileName == null) {
      fileName = document.getSourceUrl().getPath();
    }
    fileName = (fileName == null) ? "" : fileName.trim();
    if (fileName.equals("") || fileName.equals("/")) {
      // the URL has no usable path: prefer the title annotation if any,
      // otherwise use the whole URL
      String title = null;
      if (document.getNamedAnnotationSets().containsKey("Original markups") && !document.getAnnotations("Original markups").get("title").isEmpty()) {
        try {
          title = document.getContent().getContent(document.getAnnotations("Original markups").get("title").firstNode().getOffset(), document.getAnnotations("Original markups").get("title").lastNode().getOffset()).toString();
        } catch (InvalidOffsetException e) {
          // report it, then fall back to the URL below instead of keeping
          // the empty name
          e.printStackTrace();
        }
      }
      fileName = (title != null) ? title : document.getSourceUrl().toString();
      // cleans the file name
      fileName = fileName.replaceAll("/", "_");
    } else {
      // replaces the extension with the default
      fileName = fileName.replaceAll("\\.[a-zA-Z]{1,4}$", "." + de.getDefaultExtension());
    }
    // cleans the file name
    fileName = fileName.replaceAll("[^/a-zA-Z0-9._-]", "_");
    fileName = fileName.replaceAll("__+", "_");
    // adds the default extension if not present
    if (!fileName.endsWith("." + de.getDefaultExtension())) {
      fileName += "." + de.getDefaultExtension();
    }
    selectedFile = new File(fileName);
  }
  if (params == null || params.isEmpty()) {
    // no exporter parameters to ask for: a plain save dialog is enough
    XJFileChooser fileChooser = MainFrame.getFileChooser();
    fileChooser.resetChoosableFileFilters();
    fileChooser.setFileFilter(de.getFileFilter());
    fileChooser.setMultiSelectionEnabled(false);
    fileChooser.setDialogTitle("Save as " + de.getFileType());
    fileChooser.setFileSelectionMode(singleFile ? JFileChooser.FILES_ONLY : JFileChooser.DIRECTORIES_ONLY);
    if (selectedFile != null) {
      fileChooser.ensureFileIsVisible(selectedFile);
      fileChooser.setSelectedFile(selectedFile);
    }
    if (fileChooser.showSaveDialog(MainFrame.getInstance()) != JFileChooser.APPROVE_OPTION)
      return null;
    selectedFile = fileChooser.getSelectedFile();
  } else {
    // the parameter dialog collects both the parameter values and the file
    if (!dialog.show(de, params, singleFile, selectedFile != null ? selectedFile.getAbsolutePath() : ""))
      return null;
    options.putAll(dialog.getSelectedParameters());
    selectedFile = new File(dialog.getSelectedFileName());
  }
  return selectedFile;
}
use of gate.util.InvalidOffsetException in project gate-core by GateNLP.
the class HtmlDocumentHandler method handleEndTag.
// handleStartTag
/**
 * Called by the HTML parser when it meets the end of a tag, i.e. a closing
 * tag that is paired with an earlier opening tag.
 */
@Override
public void handleEndTag(HTML.Tag t, int pos) {
  // leaving a STYLE element
  if (HTML.Tag.STYLE.equals(t)) {
    isInsideStyleTag = false;
  }
  // the element being closed, if any, is on top of the stack
  if (!stack.isEmpty()) {
    CustomObject closed = stack.pop();
    if (closed.getStart().equals(closed.getEnd())) {
      // it had an end tag but spans no text: mark it as an emptyAndSpan
      // element (see CustomObject's isEmptyAndSpan field)
      closed.getFM().put("isEmptyAndSpan", "true");
    }
    colector.add(closed);
    // only elements with text in between influence the document appearance
    if (closed.getStart().longValue() != closed.getEnd().longValue()) {
      customizeAppearanceOfDocumentWithEndTag(t);
    }
  }
  // everything below happens only once, on the closing </HTML> tag
  if (t != HTML.Tag.HTML) {
    return;
  }
  // replace the old content with the one accumulated while parsing
  doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));
  // default to the document's original markups annotation set
  if (basicAS == null) {
    basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
  }
  // put the collected elements in their natural order before draining them
  Collections.sort(colector);
  while (!colector.isEmpty()) {
    CustomObject pending = colector.getFirst();
    colector.remove(pending);
    // turn the collected element into an annotation
    try {
      if (markupElementsMap == null) {
        // no mapping: the element name is used as the annotation type
        basicAS.add(pending.getStart(), pending.getEnd(), pending.getElemName(), pending.getFM());
      } else {
        // with a mapping, unmapped elements are silently skipped
        String mappedType = markupElementsMap.get(pending.getElemName());
        if (mappedType != null) {
          basicAS.add(pending.getStart(), pending.getEnd(), mappedType, pending.getFM());
        }
      }
    } catch (InvalidOffsetException e) {
      Err.prln("Error creating an annot :" + pending + " Discarded...");
    }
  }
  // tell the listeners how many elements have been processed in total
  fireStatusChangedEvent("Total elements : " + elements);
}
use of gate.util.InvalidOffsetException in project gate-core by GateNLP.
the class DocumentStaxUtils method readXces.
/**
 * Read XML data in <a href="http://www.xces.org/">XCES</a> format
 * from the given reader and add the corresponding annotations to the
 * given annotation set. The reader must be positioned on the starting
 * <code>cesAna</code> tag and will be left pointing to the
 * corresponding end tag.
 *
 * @param xsr the XMLStreamReader to read from.
 * @param as the annotation set to read into.
 * @throws XMLStreamException if the XML is not well-formed XCES, if a
 *           <code>from</code>, <code>to</code> or <code>n</code> attribute
 *           is not an integer, if an annotation ID is already in use, or if
 *           an annotation has offsets the set rejects
 */
public static void readXces(XMLStreamReader xsr, AnnotationSet as) throws XMLStreamException {
  xsr.require(XMLStreamConstants.START_ELEMENT, XCES_NAMESPACE, "cesAna");
  // Set of all annotation IDs in this set, pre-populated with the IDs of
  // any annotations already present
  Set<Integer> allAnnotIds = new TreeSet<Integer>();
  for (Annotation existing : as) {
    allAnnotIds.add(existing.getId());
  }
  // The annotations are collected before being added to the set: those
  // that specify an ID (via struct/@n) in one list and those that don't in
  // another, so the identified ones can be added first and the others
  // then take the next available IDs
  List<AnnotationObject> collectedIdentifiedAnnots = new ArrayList<AnnotationObject>();
  List<AnnotationObject> collectedNonIdentifiedAnnots = new ArrayList<AnnotationObject>();
  while (xsr.nextTag() == XMLStreamConstants.START_ELEMENT) {
    xsr.require(XMLStreamConstants.START_ELEMENT, XCES_NAMESPACE, "struct");
    AnnotationObject annObj = new AnnotationObject();
    annObj.setElemName(xsr.getAttributeValue(null, "type"));
    // a missing attribute yields null, which Long.valueOf also rejects
    // with a NumberFormatException; the cause is kept for diagnosis
    try {
      annObj.setStart(Long.valueOf(xsr.getAttributeValue(null, "from")));
    } catch (NumberFormatException nfe) {
      throw new XMLStreamException("Non-integer value found for struct/@from", xsr.getLocation(), nfe);
    }
    try {
      annObj.setEnd(Long.valueOf(xsr.getAttributeValue(null, "to")));
    } catch (NumberFormatException nfe) {
      throw new XMLStreamException("Non-integer value found for struct/@to", xsr.getLocation(), nfe);
    }
    String annotIdString = xsr.getAttributeValue(null, "n");
    if (annotIdString != null) {
      Integer annotationId;
      try {
        annotationId = Integer.valueOf(annotIdString);
      } catch (NumberFormatException nfe) {
        throw new XMLStreamException("Non-integer annotation ID found", xsr.getLocation(), nfe);
      }
      // Set.add returns false when the ID is already taken
      if (!allAnnotIds.add(annotationId)) {
        throw new XMLStreamException("Annotation IDs must be unique " + "within an annotation set. Found duplicate ID", xsr.getLocation());
      }
      annObj.setId(annotationId);
    }
    // get the features of this annotation; readXcesFeatureMap presumably
    // leaves xsr on the closing struct tag, since the next nextTag() call
    // expects either another struct or the end of cesAna
    annObj.setFM(readXcesFeatureMap(xsr));
    if (annObj.getId() != null) {
      collectedIdentifiedAnnots.add(annObj);
    } else {
      collectedNonIdentifiedAnnots.add(annObj);
    }
  }
  // finished reading, add the annotations to the set; 'current' tracks the
  // annotation being added so a failure can name it
  AnnotationObject current = null;
  try {
    // first the ones that specify an ID
    for (AnnotationObject identified : collectedIdentifiedAnnots) {
      current = identified;
      as.add(current.getId(), current.getStart(), current.getEnd(), current.getElemName(), current.getFM());
    }
    // next the ones that don't
    for (AnnotationObject anonymous : collectedNonIdentifiedAnnots) {
      current = anonymous;
      as.add(current.getStart(), current.getEnd(), current.getElemName(), current.getFM());
    }
  } catch (InvalidOffsetException ioe) {
    throw new XMLStreamException("Invalid offset when creating annotation " + current, ioe);
  }
}
use of gate.util.InvalidOffsetException in project gate-core by GateNLP.
the class AnnotationSetImpl method addAll.
// add(o)
/**
 * Adds multiple annotations to this set in one go. The provided annotations
 * are not inserted as they are: each one is used to create a new annotation
 * through this set's own add(start, end, type, features) method. According
 * to the original contract of this method, the new annotations therefore
 * get different IDs from the old ones, which is required in order to
 * preserve the uniqueness of IDs inside an annotation set.
 *
 * @param c
 *          a collection of annotations
 * @return {@code true} if the set has been modified as a result of this
 *         call, i.e. if at least one annotation was added.
 * @throws IllegalArgumentException
 *           if an annotation's offsets are rejected by this set; the
 *           underlying InvalidOffsetException is attached as the cause
 */
@Override
public boolean addAll(Collection<? extends Annotation> c) {
  boolean changed = false;
  for (Annotation a : c) {
    try {
      add(a.getStartNode().getOffset(), a.getEndNode().getOffset(), a.getType(), a.getFeatures());
      changed = true;
    } catch (InvalidOffsetException ioe) {
      // same message as before, but the cause is kept so the original
      // stack trace is not lost
      throw new IllegalArgumentException(ioe.toString(), ioe);
    }
  }
  return changed;
}
Aggregations