Examples with AceToken - edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceToken

Example 1 with AceToken

Usage of edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceToken in the CoreNLP project by stanfordnlp.

The example below is the readDocument method of the AceReader class.

/**
 * Parses the ACE document identified by the given filename prefix and turns
 * each of its sentences into a {@link CoreMap} holding the sentence tokens
 * together with the converted entity, relation and event mentions. Callers
 * should normally go through parse() instead of invoking this directly.
 *
 * <p>A "sentence" made of exactly one token that starts with "&lt;" and ends
 * with "&gt;" is treated as an SGML tag and left out of the result; its token
 * still counts towards the running token offset handed to the mention
 * converters.
 *
 * @param prefix prefix of the ACE filename to read, without the ".apf.xml"
 *          extension (e.g.
 *          "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01")
 * @param corpus not read anywhere in this method body
 * @return one CoreMap per retained sentence, in sentence-index order
 * @throws IOException declared by this method; presumably raised while
 *           parsing the document (confirm against AceDocument.parseDocument)
 * @throws SAXException declared by this method; presumably raised while
 *           parsing the document (confirm against AceDocument.parseDocument)
 * @throws ParserConfigurationException declared by this method; presumably
 *           raised while parsing the document (confirm against
 *           AceDocument.parseDocument)
 */
private List<CoreMap> readDocument(String prefix, Annotation corpus) throws IOException, SAXException, ParserConfigurationException {
    logger.info("Reading document: " + prefix);
    // ACE2004 files need the version-aware parser overload
    AceDocument aceDocument = aceVersion.equals("ACE2004")
            ? AceDocument.parseDocument(prefix, false, aceVersion)
            : AceDocument.parseDocument(prefix, false);
    String docId = aceDocument.getId();
    // ACE entity mention ID -> converted EntityMention; shared across all
    // sentences so relation/event conversion can look up their arguments
    Map<String, EntityMention> convertedEntitiesById = Generics.newHashMap();
    List<CoreMap> sentences = new ArrayList<>();
    // number of tokens in all sentences seen so far (skipped ones included)
    int tokensSoFar = 0;
    for (int s = 0; s < aceDocument.getSentenceCount(); s++) {
        List<AceToken> aceTokens = aceDocument.getSentence(s);
        List<CoreLabel> labels = new ArrayList<>();
        StringBuilder joined = new StringBuilder();
        for (AceToken aceToken : aceTokens) {
            String literal = aceToken.getLiteral();
            CoreLabel label = new CoreLabel();
            label.setWord(literal);
            label.set(CoreAnnotations.ValueAnnotation.class, label.word());
            label.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, aceToken.getByteStart());
            label.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, aceToken.getByteEnd());
            // single space between tokens, none before the first one
            if (!labels.isEmpty()) {
                joined.append(" ");
            }
            labels.add(label);
            joined.append(literal);
        }
        // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
        if (labels.size() == 1) {
            String onlyWord = labels.get(0).word();
            boolean looksLikeTag = onlyWord.startsWith("<") && onlyWord.endsWith(">");
            if (looksLikeTag) {
                tokensSoFar += aceTokens.size();
                continue;
            }
        }
        String sentenceText = joined.toString();
        CoreMap sentence = new Annotation(sentenceText);
        sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
        sentence.set(CoreAnnotations.TokensAnnotation.class, labels);
        logger.info("Reading sentence: \"" + sentenceText + "\"");
        List<AceEntityMention> aceEntities = aceDocument.getEntityMentions(s);
        List<AceRelationMention> aceRelations = aceDocument.getRelationMentions(s);
        List<AceEventMention> aceEvents = aceDocument.getEventMentions(s);
        // entity mentions first: relations and events refer to them by ID
        for (AceEntityMention aceMention : aceEntities) {
            // coref ID = ID of the first entity whose mention list contains this mention
            String corefId = "";
            for (String candidateId : aceDocument.getKeySetEntities()) {
                if (aceDocument.getEntity(candidateId).getMentions().contains(aceMention)) {
                    corefId = candidateId;
                    break;
                }
            }
            EntityMention entity = convertAceEntityMention(aceMention, docId, sentence, tokensSoFar, corefId);
            entityCounts.incrementCount(entity.getType());
            logger.info("CONVERTED MENTION HEAD SPAN: " + entity.getHead());
            logger.info("CONVERTED ENTITY MENTION: " + entity);
            AnnotationUtils.addEntityMention(sentence, entity);
            convertedEntitiesById.put(aceMention.getId(), entity);
            // TODO: make Entity objects as needed
        }
        // relation mentions; a null conversion result is silently dropped
        for (AceRelationMention aceMention : aceRelations) {
            RelationMention relation = convertAceRelationMention(aceMention, docId, sentence, convertedEntitiesById);
            // TODO: make Relation objects
            if (relation == null) {
                continue;
            }
            relationCounts.incrementCount(relation.getType());
            logger.info("CONVERTED RELATION MENTION: " + relation);
            AnnotationUtils.addRelationMention(sentence, relation);
        }
        // event mentions; a null conversion result is silently dropped
        for (AceEventMention aceMention : aceEvents) {
            EventMention event = convertAceEventMention(aceMention, docId, sentence, convertedEntitiesById, tokensSoFar);
            // TODO: make Event objects
            if (event == null) {
                continue;
            }
            eventCounts.incrementCount(event.getType());
            logger.info("CONVERTED EVENT MENTION: " + event);
            AnnotationUtils.addEventMention(sentence, event);
        }
        sentences.add(sentence);
        tokensSoFar += aceTokens.size();
    }
    return sentences;
}

Also used : EventMention(edu.stanford.nlp.ie.machinereading.structure.EventMention) AceEventMention(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEventMention) AceRelationMention(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceRelationMention) RelationMention(edu.stanford.nlp.ie.machinereading.structure.RelationMention) ArrayList(java.util.ArrayList) AceEntity(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntity) AceEventMention(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEventMention) Annotation(edu.stanford.nlp.pipeline.Annotation) AceDocument(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceDocument) CoreLabel(edu.stanford.nlp.ling.CoreLabel) EntityMention(edu.stanford.nlp.ie.machinereading.structure.EntityMention) AceEntityMention(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention) AceToken(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceToken) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) AceRelationMention(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceRelationMention) AceEntityMention(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention) CoreMap(edu.stanford.nlp.util.CoreMap)

Aggregations

AceDocument (edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceDocument)1 AceEntity (edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntity)1 AceEntityMention (edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention)1 AceEventMention (edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEventMention)1 AceRelationMention (edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceRelationMention)1 AceToken (edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceToken)1 EntityMention (edu.stanford.nlp.ie.machinereading.structure.EntityMention)1 EventMention (edu.stanford.nlp.ie.machinereading.structure.EventMention)1 RelationMention (edu.stanford.nlp.ie.machinereading.structure.RelationMention)1 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)1 CoreLabel (edu.stanford.nlp.ling.CoreLabel)1 Annotation (edu.stanford.nlp.pipeline.Annotation)1 CoreMap (edu.stanford.nlp.util.CoreMap)1 ArrayList (java.util.ArrayList)1