Use of edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention in project CoreNLP by stanfordnlp.
The class AceReader, method readDocument:
/**
 * Reads in a single ACE*.apf.xml file and converts it to RelationSentence
 * objects. However, you probably should call parse() instead.
 *
 * @param prefix prefix of the ACE filename to read, without the ".apf.xml"
 *               extension (e.g.
 *               "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01")
 * @return list of RelationSentence objects
 */
private List<CoreMap> readDocument(String prefix, Annotation corpus) throws IOException, SAXException, ParserConfigurationException {
  logger.info("Reading document: " + prefix);
  List<CoreMap> results = new ArrayList<>();
  AceDocument aceDocument;
  if (aceVersion.equals("ACE2004")) {
    aceDocument = AceDocument.parseDocument(prefix, false, aceVersion);
  } else {
    aceDocument = AceDocument.parseDocument(prefix, false);
  }
  String docId = aceDocument.getId();

  // map entity mention ID strings to their EntityMention counterparts
  Map<String, EntityMention> entityMentionMap = Generics.newHashMap();

  /*
  for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
    List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
    StringBuffer b = new StringBuffer();
    for (AceToken t : tokens) b.append(t.getLiteral() + " ");
    logger.info("SENTENCE: " + b.toString());
  }
  */

  int tokenOffset = 0;
  for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
    List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
    List<CoreLabel> words = new ArrayList<>();
    StringBuilder textContent = new StringBuilder();
    for (int i = 0; i < tokens.size(); i++) {
      CoreLabel l = new CoreLabel();
      l.setWord(tokens.get(i).getLiteral());
      l.set(CoreAnnotations.ValueAnnotation.class, l.word());
      l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokens.get(i).getByteStart());
      l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokens.get(i).getByteEnd());
      words.add(l);
      if (i > 0) textContent.append(" ");
      textContent.append(tokens.get(i).getLiteral());
    }

    // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
    if (words.size() == 1) {
      String word = words.get(0).word();
      if (word.startsWith("<") && word.endsWith(">")) {
        tokenOffset += tokens.size();
        continue;
      }
    }

    CoreMap sentence = new Annotation(textContent.toString());
    sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
    sentence.set(CoreAnnotations.TokensAnnotation.class, words);
    logger.info("Reading sentence: \"" + textContent + "\"");

    List<AceEntityMention> entityMentions = aceDocument.getEntityMentions(sentenceIndex);
    List<AceRelationMention> relationMentions = aceDocument.getRelationMentions(sentenceIndex);
    List<AceEventMention> eventMentions = aceDocument.getEventMentions(sentenceIndex);

    // convert entity mentions
    for (AceEntityMention aceEntityMention : entityMentions) {
      String corefID = "";
      for (String entityID : aceDocument.getKeySetEntities()) {
        AceEntity e = aceDocument.getEntity(entityID);
        if (e.getMentions().contains(aceEntityMention)) {
          corefID = entityID;
          break;
        }
      }
      EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
      // EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset);
      entityCounts.incrementCount(convertedMention.getType());
      logger.info("CONVERTED MENTION HEAD SPAN: " + convertedMention.getHead());
      logger.info("CONVERTED ENTITY MENTION: " + convertedMention);
      AnnotationUtils.addEntityMention(sentence, convertedMention);
      entityMentionMap.put(aceEntityMention.getId(), convertedMention);
      // TODO: make Entity objects as needed
    }

    // convert relation mentions
    for (AceRelationMention aceRelationMention : relationMentions) {
      RelationMention convertedMention = convertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap);
      if (convertedMention != null) {
        relationCounts.incrementCount(convertedMention.getType());
        logger.info("CONVERTED RELATION MENTION: " + convertedMention);
        AnnotationUtils.addRelationMention(sentence, convertedMention);
      }
      // TODO: make Relation objects
    }

    // convert EventMentions
    for (AceEventMention aceEventMention : eventMentions) {
      EventMention convertedMention = convertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset);
      if (convertedMention != null) {
        eventCounts.incrementCount(convertedMention.getType());
        logger.info("CONVERTED EVENT MENTION: " + convertedMention);
        AnnotationUtils.addEventMention(sentence, convertedMention);
      }
      // TODO: make Event objects
    }

    results.add(sentence);
    tokenOffset += tokens.size();
  }

  return results;
}
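Since readDocument is private, external callers go through the public parse() entry point that AceReader inherits from GenericDataSetReader. The sketch below shows one plausible way to drive it; the constructor arguments and the data path are assumptions for illustration, not taken from the snippet above:

import edu.stanford.nlp.ie.machinereading.domains.ace.AceReader;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;

public class AceReaderDemo {
  public static void main(String[] args) throws Exception {
    // Assumption: this constructor (no preprocessing pipeline, preprocess disabled) is available.
    AceReader reader = new AceReader(null, false);
    // parse() reads the ACE annotations under the given path, invoking
    // readDocument once per *.apf.xml file prefix it finds.
    Annotation corpus = reader.parse("/path/to/ACE2005/english_test/bc");  // hypothetical path
    int n = corpus.get(CoreAnnotations.SentencesAnnotation.class).size();
    System.out.println("Read " + n + " annotated sentences");
  }
}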
Use of edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention in project CoreNLP by stanfordnlp.
The class AceReader, method convertAceEventMention:
private EventMention convertAceEventMention(AceEventMention aceEventMention, String docId, CoreMap sentence, Map<String, EntityMention> entityMap, int tokenOffset) {
  Set<String> roleSet = aceEventMention.getRoles();
  List<String> roles = new ArrayList<>();
  for (String role : roleSet) roles.add(role);
  List<ExtractionObject> convertedArgs = new ArrayList<>();

  int left = Integer.MAX_VALUE;
  int right = Integer.MIN_VALUE;
  for (String role : roles) {
    AceEntityMention arg = aceEventMention.getArg(role);
    ExtractionObject o = entityMap.get(arg.getId());
    if (o == null) {
      logger.severe("READER ERROR: Failed to find event argument with id " + arg.getId());
      logger.severe("This happens because a few event mentions illegally span multiple sentences. Will ignore this mention.");
      return null;
    }
    convertedArgs.add(o);
    if (o.getExtentTokenStart() < left)
      left = o.getExtentTokenStart();
    if (o.getExtentTokenEnd() > right)
      right = o.getExtentTokenEnd();
  }

  AceCharSeq anchor = aceEventMention.getAnchor();
  ExtractionObject anchorObject = new ExtractionObject(aceEventMention.getId() + "-anchor", sentence, new Span(anchor.getTokenStart() - tokenOffset, anchor.getTokenEnd() + 1 - tokenOffset), "ANCHOR", null);

  EventMention em = new EventMention(aceEventMention.getId(), sentence, new Span(left, right), aceEventMention.getParent().getType(), aceEventMention.getParent().getSubtype(), anchorObject, convertedArgs, roles);
  return em;
}
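The Span passed to the EventMention above is the tightest envelope over the arguments' extents: the minimum start and the maximum end across all arguments. A small self-contained illustration of that computation, using hypothetical token spans:

public class SpanEnvelopeDemo {
  public static void main(String[] args) {
    // Hypothetical sentence-relative [start, end) token spans of three event arguments.
    int[][] argSpans = { {4, 6}, {9, 12}, {2, 3} };
    int left = Integer.MAX_VALUE;
    int right = Integer.MIN_VALUE;
    for (int[] s : argSpans) {
      if (s[0] < left) left = s[0];    // smallest start wins
      if (s[1] > right) right = s[1];  // largest end wins
    }
    // The event extent covers tokens [2, 12), so every argument falls inside it.
    System.out.println("event extent = [" + left + ", " + right + ")");
  }
}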
Use of edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention in project CoreNLP by stanfordnlp.
The class AceReader, method convertAceEntityMention:
/**
 * Converts an {@link AceEntityMention} to an {@link EntityMention}.
 *
 * @param entityMention {@link AceEntityMention} to convert
 * @param docId ID of the document containing this entity mention
 * @param sentence the sentence (as a {@link CoreMap}) containing this entity mention
 * @param tokenOffset offset used to convert extent positions to sentence-relative ones
 *                    (the ace.reader stores absolute token offsets from the beginning of
 *                    the document, but we need token offsets from the beginning of the
 *                    sentence => adjust by tokenOffset)
 * @return the entity as an {@link EntityMention}
 */
private EntityMention convertAceEntityMention(AceEntityMention entityMention, String docId, CoreMap sentence, int tokenOffset) {
  //log.info("TYPE is " + entityMention.getParent().getType());
  //log.info("SUBTYPE is " + entityMention.getParent().getSubtype());
  //log.info("LDCTYPE is " + entityMention.getLdctype());

  AceCharSeq ext = entityMention.getExtent();
  AceCharSeq head = entityMention.getHead();

  int extStart = ext.getTokenStart() - tokenOffset;
  int extEnd = ext.getTokenEnd() - tokenOffset + 1;
  if (extStart < 0) {
    logger.severe("READER ERROR: Invalid extent start " + extStart + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence);
    logger.severe("This may happen due to incorrect EOS detection. Adjusting entity extent.");
    extStart = 0;
  }
  if (extEnd > sentence.get(CoreAnnotations.TokensAnnotation.class).size()) {
    logger.severe("READER ERROR: Invalid extent end " + extEnd + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence);
    logger.severe("This may happen due to incorrect EOS detection. Adjusting entity extent.");
    extEnd = sentence.get(CoreAnnotations.TokensAnnotation.class).size();
  }

  int headStart = head.getTokenStart() - tokenOffset;
  int headEnd = head.getTokenEnd() - tokenOffset + 1;
  if (headStart < 0) {
    logger.severe("READER ERROR: Invalid head start " + headStart + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence);
    logger.severe("This may happen due to incorrect EOS detection. Adjusting entity head span.");
    headStart = 0;
  }
  if (headEnd > sentence.get(CoreAnnotations.TokensAnnotation.class).size()) {
    logger.severe("READER ERROR: Invalid head end " + headEnd + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence);
    logger.severe("This may happen due to incorrect EOS detection. Adjusting entity head span.");
    headEnd = sentence.get(CoreAnnotations.TokensAnnotation.class).size();
  }

  // must adjust due to possible incorrect EOS detection
  if (headStart < extStart) {
    headStart = extStart;
  }
  if (headEnd > extEnd) {
    headEnd = extEnd;
  }
  assert (headStart < headEnd);

  // note: the ace.reader stores absolute token offset from the beginning of the document, but
  // we need token offsets from the beginning of the sentence => adjust by tokenOffset
  // note: in ace.reader the end token position is inclusive, but
  // in our setup the end token position is exclusive => add 1 to end
  EntityMention converted = new EntityMention(entityMention.getId(), sentence, new Span(extStart, extEnd), new Span(headStart, headEnd), entityMention.getParent().getType(), entityMention.getParent().getSubtype(), entityMention.getLdctype());
  return converted;
}
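The two notes at the end of the method summarize the span arithmetic: subtract tokenOffset to go from document-absolute to sentence-relative positions, and add 1 to the end because ace.reader's end positions are inclusive while Span's are exclusive. A tiny worked example with hypothetical numbers:

public class OffsetDemo {
  public static void main(String[] args) {
    int tokenOffset = 120;      // hypothetical: the sentence starts at document token 120
    int aceStart = 123;         // ace.reader extent start, document-absolute, inclusive
    int aceEndInclusive = 125;  // ace.reader extent end, document-absolute, inclusive

    int extStart = aceStart - tokenOffset;           // 3: sentence-relative start
    int extEnd = aceEndInclusive - tokenOffset + 1;  // 6: sentence-relative, exclusive end

    // The resulting Span covers sentence tokens 3, 4, and 5.
    System.out.println("extent = [" + extStart + ", " + extEnd + ")");
  }
}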