Search in sources :

Example 1 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class LuceneDocument method createDocuments.

/**
 * Given an instance of Gate Document, it converts it into the format that
 * lucene can understand and can store in its indexes. This method also stores
 * the tokenStream on the disk in order to retrieve it at the time of
 * searching.
 */
public List<Document> createDocuments(String corpusPersistenceID, gate.Document gateDoc, String documentID, List<String> annotSetsToInclude, List<String> annotSetsToExclude, List<String> featuresToInclude, List<String> featuresToExclude, String indexLocation, String baseTokenAnnotationType, Boolean createTokensAutomatically, String indexUnitAnnotationType) {
    // NOTE(review): this listing looks like a lossy extract of the original
    // method - closing braces, several statement bodies and apparently some
    // `else` keywords are absent, so it does not compile as shown. The
    // comments below describe only what the visible lines do.
    // Strip surrounding whitespace from the base token type, when one was given.
    if (baseTokenAnnotationType != null)
        baseTokenAnnotationType = baseTokenAnnotationType.trim();
    List<Document> toReturnBack = new ArrayList<Document>();
    List<String> annotSetsToIndex = new ArrayList<String>();
    // Decide which annotation sets to index: an explicit include list wins;
    // otherwise the exclude list (if any) is consulted; otherwise all sets.
    if (annotSetsToInclude.size() > 0) {
        annotSetsToIndex = annotSetsToInclude;
    // if there's only one annotation to index, we don't need to
    // create a MergeSet
    // if(annotSetsToIndex.size() == 1) createMergeSet = false;
    } else if (annotSetsToExclude.size() > 0) {
        // if there were no annotation sets to include, check if user has
        // provided any annotation sets to exclude
        // if so, we need to index all annotation sets but provided in the
        // annotationsetstoexclude list
        Set<String> namedAnnotSets = new HashSet<String>();
        if (gateDoc.getNamedAnnotationSets() != null && gateDoc.getNamedAnnotationSets().keySet() != null) {
            namedAnnotSets = gateDoc.getNamedAnnotationSets().keySet();
        // NOTE(review): the bodies of the loop and of the two `if`s below are
        // not visible here; presumably they add the non-excluded set names
        // (and the default set) to annotSetsToIndex - confirm against the
        // original source.
        for (String setName : namedAnnotSets) {
            if (annotSetsToExclude.contains(setName))
        if (!annotSetsToExclude.contains(Constants.DEFAULT_ANNOTATION_SET_NAME)) {
    } else {
        // if both annotation sets to include and annotation sets to
        // exclude are empty
        // we need to index all annotation sets
        Set<String> namedAnnotSets = new HashSet<String>();
        if (gateDoc.getNamedAnnotationSets() != null && gateDoc.getNamedAnnotationSets().keySet() != null) {
            namedAnnotSets = gateDoc.getNamedAnnotationSets().keySet();
        // NOTE(review): loop body not visible in this listing.
        for (String setName : namedAnnotSets) {
    // lets find out the annotation set that contains tokens in it
    AnnotationSet baseTokenAnnotationSet = null;
    // search in annotation sets to find out which of them has the
    // baseTokenAnnotationType annotations
    // initially this is set to false
    boolean searchBaseTokensInAllAnnotationSets = false;
    boolean searchIndexUnitInAllAnnotationSets = false;
    // this variable tells whether we want to create manual tokens or
    // not
    boolean createManualTokens = false;
    // lets check if user's input is setName.basetokenAnnotationType
    int index = -1;
    if (baseTokenAnnotationType != null && baseTokenAnnotationType.length() > 0)
        index = baseTokenAnnotationType.lastIndexOf('.');
    // a '.' was found, so the input has the form
    // setName.basetokenAnnotationType: split it at the last '.'
    if (index >= 0) {
        // set name
        String setName = baseTokenAnnotationType.substring(0, index);
        // token type
        baseTokenAnnotationType = baseTokenAnnotationType.substring(index + 1, baseTokenAnnotationType.length());
        // annotation set
        // NOTE(review): as listed, the second assignment always overwrites
        // the first; an `else` between them appears to have been lost.
        if (setName.equals(Constants.DEFAULT_ANNOTATION_SET_NAME))
            baseTokenAnnotationSet = gateDoc.getAnnotations().get(baseTokenAnnotationType);
            baseTokenAnnotationSet = gateDoc.getAnnotations(setName).get(baseTokenAnnotationType);
        // nothing of the base token annotation type was found in the named
        // set: warn and fall back to searching every set to be indexed
        if (baseTokenAnnotationSet == null || baseTokenAnnotationSet.size() == 0) {
            System.err.println("Base Tokens " + baseTokenAnnotationType + " counldn't be found under the specified annotation set " + setName + "\n searching them in other annotation sets");
            searchBaseTokensInAllAnnotationSets = true;
    } else {
        // either baseTokenAnnotation type is null or user hasn't provided
        // any annotation set name
        // so we search in all annotation sets
        searchBaseTokensInAllAnnotationSets = true;
    if (baseTokenAnnotationType != null && baseTokenAnnotationType.length() > 0 && searchBaseTokensInAllAnnotationSets) {
        // we set this to true and if we find basetokens in any of the
        // annotationsets to index
        // we will set this to false
        createManualTokens = true;
        // no break is visible in this loop, so as listed a later non-empty
        // match replaces an earlier one
        for (String aSet : annotSetsToIndex) {
            if (aSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME)) {
                AnnotationSet tempSet = gateDoc.getAnnotations().get(baseTokenAnnotationType);
                if (tempSet.size() > 0) {
                    baseTokenAnnotationSet = tempSet;
                    // System.out.println("found in default annotation set");
                    createManualTokens = false;
            } else {
                AnnotationSet tempSet = gateDoc.getAnnotations(aSet).get(baseTokenAnnotationType);
                if (tempSet.size() > 0) {
                    baseTokenAnnotationSet = tempSet;
                    // System.out.println("found in "+aSet);
                    createManualTokens = false;
    // no base token type was supplied at all:
    // we'll have to create tokens ourselves
    if (baseTokenAnnotationType == null || baseTokenAnnotationType.length() == 0)
        createManualTokens = true;
    // lets check if we have to create ManualTokens
    if (createManualTokens) {
        // NOTE(review): createTokensAutomatically is a boxed Boolean; a null
        // argument would throw NullPointerException on booleanValue().
        if (!createTokensAutomatically.booleanValue()) {
            System.out.println("Tokens couldn't be found in the document - Ignoring the document " + gateDoc.getName());
            // the caller receives null (not an empty list) for a skipped document
            return null;
        // manual tokens are created under the ANNIC token type
        baseTokenAnnotationType = Constants.ANNIC_TOKEN;
        if (baseTokenAnnotationSet == null) {
            baseTokenAnnotationSet = new AnnotationSetImpl(gateDoc);
        if (!createTokens(gateDoc, baseTokenAnnotationSet)) {
            System.out.println("Tokens couldn't be created manually - Ignoring the document " + gateDoc.getName());
            return null;
    // by now, baseTokenAnnotationSet will not be null for sure and we
    // know what's the baseTokenAnnotationType
    // lets find out the annotation set that contains
    // indexUnitAnnotationType in it
    AnnotationSet indexUnitAnnotationSet = null;
    // lets check if user has provided setName.indexUnitAnnotationType
    index = -1;
    if (indexUnitAnnotationType != null && indexUnitAnnotationType.trim().length() > 0)
        index = indexUnitAnnotationType.lastIndexOf('.');
    // a '.' was found, so the input has the form
    // setName.indexUnitAnnotationType: split it at the last '.'
    if (index >= 0) {
        // setName
        String setName = indexUnitAnnotationType.substring(0, index);
        // indexUnitAnnotationType
        indexUnitAnnotationType = indexUnitAnnotationType.substring(index + 1, indexUnitAnnotationType.length());
        // NOTE(review): as with the base token lookup, an `else` between the
        // two assignments below appears to have been lost.
        if (setName.equals(Constants.DEFAULT_ANNOTATION_SET_NAME))
            indexUnitAnnotationSet = gateDoc.getAnnotations().get(indexUnitAnnotationType);
            indexUnitAnnotationSet = gateDoc.getAnnotations(setName).get(indexUnitAnnotationType);
        // nothing found in the named set?
        // if so, we'll have to search other annotation sets
        if (indexUnitAnnotationSet == null || indexUnitAnnotationSet.size() == 0) {
            System.err.println("Index Unit " + indexUnitAnnotationType + " counldn't be found under the specified annotation set " + setName + "\n searching them in other annotation sets");
            searchIndexUnitInAllAnnotationSets = true;
    } else {
        // either indexUnitAnnotationType is null or user hasn't provided
        // the setname
        searchIndexUnitInAllAnnotationSets = true;
    // searching in all annotation set names
    if (indexUnitAnnotationType != null && indexUnitAnnotationType.length() > 0 && searchIndexUnitInAllAnnotationSets) {
        for (String aSet : annotSetsToIndex) {
            if (aSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME)) {
                AnnotationSet tempSet = gateDoc.getAnnotations().get(indexUnitAnnotationType);
                if (tempSet.size() > 0) {
                    indexUnitAnnotationSet = tempSet;
            } else {
                AnnotationSet tempSet = gateDoc.getAnnotations(aSet).get(indexUnitAnnotationType);
                if (tempSet.size() > 0) {
                    indexUnitAnnotationSet = tempSet;
    // if no index unit annotation set was found, set the index unit type
    // to null as well
    if (indexUnitAnnotationSet == null) {
        indexUnitAnnotationType = null;
    // j is declared outside the loop over annotation sets and is only ever
    // incremented, so the "-j" suffix used below keeps counting across sets
    int j = 0;
    for (String annotSet : annotSetsToIndex) {
        // we need to generate the Token Stream here, and send it to the
        // GateLuceneReader
        AnnotationSet aSetToIndex = annotSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME) ? gateDoc.getAnnotations() : gateDoc.getAnnotations(annotSet);
        // passed to getTokens and read back below; presumably getTokens fills
        // it with the names of the features it indexed - confirm
        Set<String> indexedFeatures = new HashSet<String>();
        // one token list per Lucene document to be created for this set
        List<Token>[] tokenStreams = getTokens(gateDoc, aSetToIndex, featuresToInclude, featuresToExclude, baseTokenAnnotationType, baseTokenAnnotationSet, indexUnitAnnotationType, indexUnitAnnotationSet, indexedFeatures);
        // getTokens returned null: give up on the whole document
        if (tokenStreams == null)
            return null;
        // this is enabled only if there are more than one annotation sets
        // available to search in
        // if(createMergeSet) {
        // if(mergedSet == null) mergedSet = new AnnotationSetImpl(gateDoc);
        // // we need to merge all annotations but the
        // // baseTokenAnnotationType
        // for(String aType : aSetToIndex.getAllTypes()) {
        // if(aType.equals(baseTokenAnnotationType)) {
        // continue;
        // }
        // if(indexUnitAnnotationType != null
        // && aType.equals(indexUnitAnnotationType)) {
        // continue;
        // }
        // for(Annotation a : aSetToIndex.get(aType)) {
        // try {
        // mergedSet.add(a.getStartNode().getOffset(), a.getEndNode()
        // .getOffset(), a.getType(), a.getFeatures());
        // }
        // catch(InvalidOffsetException ioe) {
        // throw new GateRuntimeException(ioe);
        // }
        // }
        // }
        // }
        // join the indexed feature names with ';' (a trailing ';' is left on
        // and trimmed when the field is added below)
        StringBuffer indexedFeaturesString = new StringBuffer();
        for (String aFeat : indexedFeatures) {
            indexedFeaturesString.append(aFeat + ";");
        Document[] toReturn = new Document[tokenStreams.length];
        // i restarts for every annotation set while j does not
        for (int i = 0; i < tokenStreams.length; i++, j++) {
            // make a new, empty document
            Document doc = new Document();
            // and then create the document
            LuceneReader reader = new LuceneReader(gateDoc, tokenStreams[i]);
            doc.add(Field.Keyword(Constants.DOCUMENT_ID, documentID));
            doc.add(Field.Keyword(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE, documentID + "-" + j));
            // NOTE(review): if indexedFeatures is empty the buffer has length 0
            // and substring(0, -1) throws StringIndexOutOfBoundsException;
            // confirm getTokens always records at least one feature.
            doc.add(Field.Keyword(Constants.INDEXED_FEATURES, indexedFeaturesString.substring(0, indexedFeaturesString.length() - 1)));
            // the corpus ID field is optional
            if (corpusPersistenceID != null)
                doc.add(Field.Keyword(Constants.CORPUS_ID, corpusPersistenceID));
            doc.add(Field.Keyword(Constants.ANNOTATION_SET_ID, annotSet));
            doc.add(Field.Text("contents", reader));
            // here we store token stream on the file system
            try {
                writeOnDisk(tokenStreams[i], documentID, documentID + "-" + j, indexLocation);
            } catch (Exception e) {
                Err.println("\nIgnoring the document : " + gateDoc.getName() + " since its token stream cannot be written on the disk");
                Err.println("Reason: " + e.getMessage());
                return null;
            // return the document
            toReturn[i] = doc;
    // NOTE(review): no visible line copies toReturn into toReturnBack, so as
    // listed this returns an empty list; the copy was presumably lost in
    // extraction - confirm against the original source.
    return toReturnBack;
Also used : HashSet(java.util.HashSet) AnnotationSet(gate.AnnotationSet) Set(java.util.Set) ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) Document(gate.creole.annic.apache.lucene.document.Document) InvalidOffsetException(gate.util.InvalidOffsetException) GateRuntimeException(gate.util.GateRuntimeException) IOException(java.io.IOException) AnnotationSetImpl(gate.annotation.AnnotationSetImpl) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet)

Example 2 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class DocumentExportMenu method getSelectedFile.

/**
 * Works out which file (or directory) an export should be written to.
 * <p>
 * When the handle's target is a {@code Document} with a source URL, a default
 * file name is first derived from that URL (or from the document's title
 * annotation / URL string when the URL path is empty) and cleaned up. The
 * user is then asked to confirm or change it: through the shared file chooser
 * when {@code params} is null or empty, otherwise through a dialog whose call
 * is garbled in this listing.
 * <p>
 * NOTE(review): this listing is a lossy extract - closing braces and some
 * statements are missing, so it does not compile as shown.
 *
 * @param params exporter parameters; when null or empty the file chooser
 *          branch is taken
 * @param de the exporter, used here for its file type (chooser title) and
 *          default extension
 * @param options not referenced in the visible lines
 * @return the selected file, or {@code null} when the chooser is not approved
 *         or the dialog check in the other branch fails
 */
private File getSelectedFile(List<List<Parameter>> params, DocumentExporter de, FeatureMap options) {
    File selectedFile = null;
    // null unless the handle's target is a single Document
    Document document = (handle.getTarget() instanceof Document ? (Document) handle.getTarget() : null);
    // are we looking for a file or a directory?
    boolean singleFile = (document != null) || (de instanceof CorpusExporter);
    if (document != null && document.getSourceUrl() != null) {
        String fileName = "";
        try {
            fileName = document.getSourceUrl().toURI().getPath().trim();
        } catch (URISyntaxException e) {
            // the URL is not a valid URI: fall back to the URL's own path
            fileName = document.getSourceUrl().getPath().trim();
        // the URL has no usable path component
        if (fileName.equals("") || fileName.equals("/")) {
            if (document.getNamedAnnotationSets().containsKey("Original markups") && !document.getAnnotations("Original markups").get("title").isEmpty()) {
                // use the title annotation if any
                try {
                    fileName = document.getContent().getContent(document.getAnnotations("Original markups").get("title").firstNode().getOffset(), document.getAnnotations("Original markups").get("title").lastNode().getOffset()).toString();
                } catch (InvalidOffsetException e) {
                // NOTE(review): no handler body is visible for this catch;
                // if it is really empty the exception is swallowed - confirm.
            } else {
                fileName = document.getSourceUrl().toString();
            // cleans the file name
            fileName = fileName.replaceAll("/", "_");
        } else {
            // replaces the extension with the default
            // (only a trailing '.' followed by 1 to 4 ASCII letters is matched)
            fileName = fileName.replaceAll("\\.[a-zA-Z]{1,4}$", "." + de.getDefaultExtension());
        // cleans the file name
        // every character other than '/', ASCII letters, digits, '.', '_' and
        // '-' becomes '_', then runs of underscores are collapsed to one
        fileName = fileName.replaceAll("[^/a-zA-Z0-9._-]", "_");
        fileName = fileName.replaceAll("__+", "_");
        // adds the default extension if not present
        if (!fileName.endsWith("." + de.getDefaultExtension())) {
            fileName += "." + de.getDefaultExtension();
        selectedFile = new File(fileName);
    // no exporter parameters to ask for: a plain file chooser is used
    if (params == null || params.isEmpty()) {
        XJFileChooser fileChooser = MainFrame.getFileChooser();
        fileChooser.setDialogTitle("Save as " + de.getFileType());
        fileChooser.setFileSelectionMode(singleFile ? JFileChooser.FILES_ONLY : JFileChooser.DIRECTORIES_ONLY);
        // NOTE(review): the body of this `if` is not visible; presumably it
        // pre-selects the derived file in the chooser - confirm.
        if (selectedFile != null) {
        // anything other than approval of the dialog means no file
        if (fileChooser.showSaveDialog(MainFrame.getInstance()) != JFileChooser.APPROVE_OPTION)
            return null;
        selectedFile = fileChooser.getSelectedFile();
    } else {
        // NOTE(review): the condition below is garbled in this listing; it is
        // presumably a call on `dialog` (for example dialog.show(de, params,
        // ...)) whose receiver and first argument were stripped - confirm
        // against the original source.
        if (!, params, singleFile, selectedFile != null ? selectedFile.getAbsolutePath() : ""))
            return null;
        selectedFile = new File(dialog.getSelectedFileName());
    return selectedFile;
Also used : XJFileChooser(gate.swing.XJFileChooser) CorpusExporter(gate.CorpusExporter) InvalidOffsetException(gate.util.InvalidOffsetException) URISyntaxException(java.net.URISyntaxException) Document(gate.Document) File(java.io.File)

Example 3 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class HtmlDocumentHandler method handleEndTag.

// handleStartTag
/**
 * This method is called when the HTML parser encounters the end of a tag,
 * which means that the tag is paired with a beginning tag.
 */
public void handleEndTag(HTML.Tag t, int pos) {
    // NOTE(review): this listing is a lossy extract - closing braces and some
    // statements are missing, so it does not compile as shown. The comments
    // below describe only what the visible lines do. `pos` is not referenced
    // in the visible lines.
    // obj is for internal use
    CustomObject obj = null;
    // end of STYLE tag
    if (HTML.Tag.STYLE.equals(t)) {
        isInsideStyleTag = false;
    // If the stack is not empty then we get the object from the stack
    if (!stack.isEmpty()) {
        obj = stack.pop();
        // If the popped element's start equals its end, mark it as an
        // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
        if (obj.getStart().equals(obj.getEnd())) {
            // The element had an end tag and its start was equal to its end. Hence
            // it is anEmptyAndSpan one.
            obj.getFM().put("isEmptyAndSpan", "true");
        // End iff
        // we add it to the colector
        // (NOTE(review): the add itself is not visible in this listing)
    // If element has text between, then customize its appearance
    // NOTE(review): the statement guarded by this `if` is not visible here.
    if (obj != null && obj.getStart().longValue() != obj.getEnd().longValue())
        // Customize the appearance of the document
    // if t is the </HTML> tag then we reached the end of the HTML document
    if (t == HTML.Tag.HTML) {
        // replace the old content with the new one
        doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));
        // if no annotation set has been supplied, use the "original markups"
        // set from this gate document
        if (basicAS == null)
            basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
        // sort colector ascending on its id
        // (NOTE(review): the sort call is not visible in this listing)
        // iterate through colector and construct annotations
        // NOTE(review): nothing visible removes the first element, so as
        // listed this loop would not terminate; the removal was presumably
        // lost in extraction - confirm against the original source.
        while (!colector.isEmpty()) {
            obj = colector.getFirst();
            // Construct an annotation from this obj
            try {
                if (markupElementsMap == null) {
                    // no mapping: the element name is used as annotation type
                    basicAS.add(obj.getStart(), obj.getEnd(), obj.getElemName(), obj.getFM());
                } else {
                    // with a mapping, elements that have no entry in it are
                    // not added
                    String annotationType = markupElementsMap.get(obj.getElemName());
                    if (annotationType != null)
                        basicAS.add(obj.getStart(), obj.getEnd(), annotationType, obj.getFM());
            } catch (InvalidOffsetException e) {
                // the offending element is reported and dropped
                Err.prln("Error creating an annot :" + obj + " Discarded...");
        // end try
        // }// end if
        // while
        // notify the listener about the total amount of elements that
        // has been processed
        fireStatusChangedEvent("Total elements : " + elements);
// else
Also used : DocumentContentImpl(gate.corpora.DocumentContentImpl) InvalidOffsetException(gate.util.InvalidOffsetException)

Example 4 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class DocumentStaxUtils method readXces.

/**
 * Read XML data in XCES format
 * from the given reader and add the corresponding annotations to the
 * given annotation set. The reader must be positioned on the starting
 * <code>cesAna</code> tag and will be left pointing to the
 * corresponding end tag.
 * @param xsr the XMLStreamReader to read from.
 * @param as the annotation set to read into.
 * @throws XMLStreamException
 */
public static void readXces(XMLStreamReader xsr, AnnotationSet as) throws XMLStreamException {
    // NOTE(review): this listing is a lossy extract - closing braces and some
    // statements are missing, so it does not compile as shown. The comments
    // below describe only what the visible lines do.
    // fail fast unless the reader is on the opening cesAna element
    xsr.require(XMLStreamConstants.START_ELEMENT, XCES_NAMESPACE, "cesAna");
    // Set of all annotation IDs in this set.
    Set<Integer> allAnnotIds = new TreeSet<Integer>();
    // pre-populate with the IDs of any existing annotations in the set
    // NOTE(review): the loop body is not visible in this listing.
    for (Annotation a : as) {
    // lists to collect the annotations in before adding them to the
    // set. We collect the annotations that specify an ID (via
    // struct/@n) in one list and those that don't in another, so we can
    // add the identified ones first, then the others will take the next
    // available ID
    List<AnnotationObject> collectedIdentifiedAnnots = new ArrayList<AnnotationObject>();
    List<AnnotationObject> collectedNonIdentifiedAnnots = new ArrayList<AnnotationObject>();
    // each child start tag must be a struct element
    while (xsr.nextTag() == XMLStreamConstants.START_ELEMENT) {
        xsr.require(XMLStreamConstants.START_ELEMENT, XCES_NAMESPACE, "struct");
        AnnotationObject annObj = new AnnotationObject();
        // struct/@type becomes the annotation's element name
        annObj.setElemName(xsr.getAttributeValue(null, "type"));
        // struct/@from and struct/@to are the start and end offsets; a value
        // that Long.valueOf rejects is reported as an XMLStreamException
        // carrying the reader's current location
        try {
            annObj.setStart(Long.valueOf(xsr.getAttributeValue(null, "from")));
        } catch (NumberFormatException nfe) {
            throw new XMLStreamException("Non-integer value found for struct/@from", xsr.getLocation());
        try {
            annObj.setEnd(Long.valueOf(xsr.getAttributeValue(null, "to")));
        } catch (NumberFormatException nfe) {
            throw new XMLStreamException("Non-integer value found for struct/@to", xsr.getLocation());
        // struct/@n is optional
        String annotIdString = xsr.getAttributeValue(null, "n");
        if (annotIdString != null) {
            try {
                Integer annotationId = Integer.valueOf(annotIdString);
                // an ID already seen is an error
                // NOTE(review): the lines that record the new ID in
                // allAnnotIds and on annObj are not visible in this listing.
                if (allAnnotIds.contains(annotationId)) {
                    throw new XMLStreamException("Annotation IDs must be unique " + "within an annotation set. Found duplicate ID", xsr.getLocation());
            } catch (NumberFormatException nfe) {
                throw new XMLStreamException("Non-integer annotation ID found", xsr.getLocation());
        // get the features of this annotation
        // readFeatureMap leaves xsr on the </Annotation> tag
        // NOTE(review): both branches below are empty in this listing;
        // presumably they add annObj to the identified / non-identified list
        // respectively - confirm against the original source.
        if (annObj.getId() != null) {
        } else {
    // finished reading, add the annotations to the set
    // declared outside the try so the catch can name the failing annotation
    AnnotationObject a = null;
    try {
        // first the ones that specify an ID
        Iterator<AnnotationObject> it = collectedIdentifiedAnnots.iterator();
        while (it.hasNext()) {
            // NOTE(review): the right-hand side of this assignment is missing
            // in the listing; presumably the iterator's next element - confirm.
            a =;
            as.add(a.getId(), a.getStart(), a.getEnd(), a.getElemName(), a.getFM());
        // next the ones that don't
        it = collectedNonIdentifiedAnnots.iterator();
        while (it.hasNext()) {
            // NOTE(review): right-hand side missing here as well.
            a =;
            as.add(a.getStart(), a.getEnd(), a.getElemName(), a.getFM());
    } catch (InvalidOffsetException ioe) {
        // rethrown with the InvalidOffsetException attached as the cause
        throw new XMLStreamException("Invalid offset when creating annotation " + a, ioe);
Also used : XMLStreamException(javax.xml.stream.XMLStreamException) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) InvalidOffsetException(gate.util.InvalidOffsetException) Annotation(gate.Annotation)

Example 5 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class AnnotationSetImpl method addAll.

// add(o)
/**
 * Adds multiple annotations to this set in one go. All the objects in the
 * provided collection should be of {@link gate.Annotation} type, otherwise a
 * ClassCastException will be thrown. The provided annotations will be used to
 * create new annotations using the appropriate add() methods from this set.
 * The new annotations will have different IDs from the old ones (which is
 * required in order to preserve the uniqueness of IDs inside an annotation
 * set).
 * @param c
 *          a collection of annotations
 * @return <tt>true</tt> if the set has been modified as a result of this
 *         call.
 */
public boolean addAll(Collection<? extends Annotation> c) {
    // NOTE(review): this listing is a lossy extract - closing braces are
    // missing and one assignment is garbled, so it does not compile as shown.
    Iterator<? extends Annotation> annIter = c.iterator();
    // becomes true once any add() call has completed without throwing
    boolean changed = false;
    while (annIter.hasNext()) {
        // NOTE(review): the right-hand side of this assignment is missing in
        // the listing; presumably the iterator's next element - confirm.
        Annotation a =;
        try {
            // re-create the annotation in this set from its offsets, type and
            // features rather than inserting the object itself
            add(a.getStartNode().getOffset(), a.getEndNode().getOffset(), a.getType(), a.getFeatures());
            changed = true;
        } catch (InvalidOffsetException ioe) {
            // only ioe.toString() is carried over; the original exception is
            // not attached as the cause
            throw new IllegalArgumentException(ioe.toString());
    return changed;
Also used : InvalidOffsetException(gate.util.InvalidOffsetException) Annotation(gate.Annotation)


InvalidOffsetException (gate.util.InvalidOffsetException)15 Annotation (gate.Annotation)6 AnnotationSet (gate.AnnotationSet)5 ArrayList (java.util.ArrayList)5 HashSet (java.util.HashSet)4 FeatureMap (gate.FeatureMap)3 AnnotationSetImpl (gate.annotation.AnnotationSetImpl)3 DocumentFormatException (gate.util.DocumentFormatException)3 GateRuntimeException (gate.util.GateRuntimeException)3 IOException (java.io.IOException)3 List (java.util.List)3 DocumentContentImpl (gate.corpora.DocumentContentImpl)2 XMLStreamException (javax.xml.stream.XMLStreamException)2 CorpusExporter (gate.CorpusExporter)1 Document (gate.Document)1 Node (gate.Node)1 Token (gate.creole.annic.apache.lucene.analysis.Token)1 Document (gate.creole.annic.apache.lucene.document.Document)1 EmailDocumentHandler (gate.email.EmailDocumentHandler)1 StatusListener (gate.event.StatusListener)1