Example 56 with Annotation

Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

From the class MachineReading, method keepPercentage.

/** Keeps only the first {@code percentage} fraction of sentences from the given corpus. */
private static Annotation keepPercentage(Annotation corpus, double percentage) {
    log.info("Using fraction of train: " + percentage);
    if (percentage >= 1.0) {
        return corpus;
    }
    Annotation smaller = new Annotation("");
    List<CoreMap> sents = new ArrayList<>();
    List<CoreMap> fullSents = corpus.get(SentencesAnnotation.class);
    double smallSize = (double) fullSents.size() * percentage;
    for (int i = 0; i < smallSize; i++) {
        sents.add(fullSents.get(i));
    }
    log.info("TRAIN corpus size reduced from " + fullSents.size() + " to " + sents.size());
    smaller.set(SentencesAnnotation.class, sents);
    return smaller;
}
Also used: ArrayList(java.util.ArrayList) CoreMap(edu.stanford.nlp.util.CoreMap) SentencesAnnotation(edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation) TreeAnnotation(edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation) TextAnnotation(edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation) EntityMentionsAnnotation(edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations.EntityMentionsAnnotation) TokensAnnotation(edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation) Annotation(edu.stanford.nlp.pipeline.Annotation)
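
The helper above is private to MachineReading, but the subsampling it performs is easy to reproduce against any corpus Annotation. A minimal sketch (the toy corpus and the KeepPercentageDemo class are illustrative assumptions, not part of CoreNLP):

import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.CoreMap;

public class KeepPercentageDemo {
    public static void main(String[] args) {
        // Build a toy corpus of ten one-sentence CoreMaps
        // (Annotation implements CoreMap, so it can stand in for a sentence here).
        Annotation corpus = new Annotation("");
        List<CoreMap> sents = new ArrayList<>();
        for (int i = 0; i < 10; i++) {
            sents.add(new Annotation("sentence " + i));
        }
        corpus.set(SentencesAnnotation.class, sents);

        // Same truncation as keepPercentage(corpus, 0.5): keep the first
        // ceil(n * percentage) sentences in a fresh Annotation.
        double percentage = 0.5;
        List<CoreMap> full = corpus.get(SentencesAnnotation.class);
        int keep = (int) Math.ceil(full.size() * percentage);
        Annotation smaller = new Annotation("");
        smaller.set(SentencesAnnotation.class, new ArrayList<>(full.subList(0, keep)));
        System.out.println("kept " + keep + " of " + full.size() + " sentences");
    }
}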

Example 57 with Annotation

Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

From the class MachineReading, method train.

protected void train(Annotation training, int partition) throws Exception {
    // Entity extraction
    if (MachineReadingProperties.extractEntities) {
        MachineReadingProperties.logger.info("Training entity extraction model(s)");
        if (partition != -1)
            MachineReadingProperties.logger.info("In partition #" + partition);
        String modelName = MachineReadingProperties.serializedEntityExtractorPath;
        if (partition != -1)
            modelName += "." + partition;
        File modelFile = new File(modelName);
        MachineReadingProperties.logger.fine("forceRetraining = " + this.forceRetraining + ", modelFile.exists = " + modelFile.exists());
        if (!this.forceRetraining && modelFile.exists()) {
            MachineReadingProperties.logger.info("Loading entity extraction model from " + modelName + " ...");
            entityExtractor = BasicEntityExtractor.load(modelName, MachineReadingProperties.entityClassifier, false);
        } else {
            MachineReadingProperties.logger.info("Training entity extraction model...");
            entityExtractor = makeEntityExtractor(MachineReadingProperties.entityClassifier, MachineReadingProperties.entityGazetteerPath);
            entityExtractor.train(training);
            MachineReadingProperties.logger.info("Serializing entity extraction model to " + modelName + " ...");
            entityExtractor.save(modelName);
        }
    }
    // Relation extraction
    if (MachineReadingProperties.extractRelations) {
        MachineReadingProperties.logger.info("Training relation extraction model(s)");
        if (partition != -1)
            MachineReadingProperties.logger.info("In partition #" + partition);
        String modelName = MachineReadingProperties.serializedRelationExtractorPath;
        if (partition != -1)
            modelName += "." + partition;
        if (MachineReadingProperties.useRelationExtractionModelMerging) {
            String[] modelNames = MachineReadingProperties.serializedRelationExtractorPath.split(",");
            if (partition != -1) {
                for (int i = 0; i < modelNames.length; i++) {
                    modelNames[i] += "." + partition;
                }
            }
            relationExtractor = ExtractorMerger.buildRelationExtractorMerger(modelNames);
        } else if (!this.forceRetraining && new File(modelName).exists()) {
            MachineReadingProperties.logger.info("Loading relation extraction model from " + modelName + " ...");
            //TODO change this to load any type of BasicRelationExtractor
            relationExtractor = BasicRelationExtractor.load(modelName);
        } else {
            RelationFeatureFactory rff = makeRelationFeatureFactory(MachineReadingProperties.relationFeatureFactoryClass, MachineReadingProperties.relationFeatures, MachineReadingProperties.doNotLexicalizeFirstArg);
            ArgumentParser.fillOptions(rff, args);
            Annotation predicted = null;
            if (MachineReadingProperties.trainRelationsUsingPredictedEntities) {
                // generate predicted entities
                assert (entityExtractor != null);
                predicted = AnnotationUtils.deepMentionCopy(training);
                entityExtractor.annotate(predicted);
                for (ResultsPrinter rp : entityResultsPrinterSet) {
                    String msg = rp.printResults(training, predicted);
                    MachineReadingProperties.logger.info("Training relation extraction using predicted entitities: entity scores using printer " + rp.getClass() + ":\n" + msg);
                }
                // change relation mentions to use predicted entity mentions rather than gold ones
                try {
                    changeGoldRelationArgsToPredicted(predicted);
                } catch (Exception e) {
                    // we may get here for unknown EntityMentionComparator class
                    throw new RuntimeException(e);
                }
            }
            Annotation dataset;
            if (MachineReadingProperties.trainRelationsUsingPredictedEntities) {
                dataset = predicted;
            } else {
                dataset = training;
            }
            Set<String> relationsToSkip = new HashSet<>(StringUtils.split(MachineReadingProperties.relationsToSkipDuringTraining, ","));
            List<List<RelationMention>> backedUpRelations = new ArrayList<>();
            if (relationsToSkip.size() > 0) {
                // we need to back up the relations since removeSkippableRelations modifies dataset in place and we can't duplicate CoreMaps safely (or can we?)
                for (CoreMap sent : dataset.get(CoreAnnotations.SentencesAnnotation.class)) {
                    List<RelationMention> relationMentions = sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
                    backedUpRelations.add(relationMentions);
                }
                removeSkippableRelations(dataset, relationsToSkip);
            }
            //relationExtractor = new BasicRelationExtractor(rff, MachineReadingProperties.createUnrelatedRelations, makeRelationMentionFactory(MachineReadingProperties.relationMentionFactoryClass));
            relationExtractor = makeRelationExtractor(MachineReadingProperties.relationClassifier, rff, MachineReadingProperties.createUnrelatedRelations, makeRelationMentionFactory(MachineReadingProperties.relationMentionFactoryClass));
            ArgumentParser.fillOptions(relationExtractor, args);
            //Arguments.parse(args,relationExtractor);
            MachineReadingProperties.logger.info("Training relation extraction model...");
            relationExtractor.train(dataset);
            MachineReadingProperties.logger.info("Serializing relation extraction model to " + modelName + " ...");
            relationExtractor.save(modelName);
            if (relationsToSkip.size() > 0) {
                // restore backed up relations into dataset
                int sentenceIndex = 0;
                for (CoreMap sentence : dataset.get(CoreAnnotations.SentencesAnnotation.class)) {
                    List<RelationMention> relationMentions = backedUpRelations.get(sentenceIndex);
                    sentence.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, relationMentions);
                    sentenceIndex++;
                }
            }
        }
    }
    // Event extraction
    if (MachineReadingProperties.extractEvents) {
        MachineReadingProperties.logger.info("Training event extraction model(s)");
        if (partition != -1)
            MachineReadingProperties.logger.info("In partition #" + partition);
        String modelName = MachineReadingProperties.serializedEventExtractorPath;
        if (partition != -1)
            modelName += "." + partition;
        File modelFile = new File(modelName);
        if (!this.forceRetraining && modelFile.exists()) {
            MachineReadingProperties.logger.info("Loading event extraction model from " + modelName + " ...");
            Method mstLoader = (Class.forName("MSTBasedEventExtractor")).getMethod("load", String.class);
            eventExtractor = (Extractor) mstLoader.invoke(null, modelName);
        } else {
            Annotation predicted = null;
            if (MachineReadingProperties.trainEventsUsingPredictedEntities) {
                // generate predicted entities
                assert (entityExtractor != null);
                predicted = AnnotationUtils.deepMentionCopy(training);
                entityExtractor.annotate(predicted);
                for (ResultsPrinter rp : entityResultsPrinterSet) {
                    String msg = rp.printResults(training, predicted);
                    MachineReadingProperties.logger.info("Training event extraction using predicted entitities: entity scores using printer " + rp.getClass() + ":\n" + msg);
                }
            // TODO: need an equivalent of changeGoldRelationArgsToPredicted here?
            }
            Constructor<?> mstConstructor = (Class.forName("edu.stanford.nlp.ie.machinereading.MSTBasedEventExtractor")).getConstructor(boolean.class);
            eventExtractor = (Extractor) mstConstructor.newInstance(MachineReadingProperties.trainEventsUsingPredictedEntities);
            MachineReadingProperties.logger.info("Training event extraction model...");
            // must test the *events* flag here: predicted is only non-null when
            // trainEventsUsingPredictedEntities was set above
            if (MachineReadingProperties.trainEventsUsingPredictedEntities) {
                eventExtractor.train(predicted);
            } else {
                eventExtractor.train(training);
            }
            MachineReadingProperties.logger.info("Serializing event extraction model to " + modelName + " ...");
            eventExtractor.save(modelName);
        }
    }
}
Also used: HashSet(java.util.HashSet) Set(java.util.Set) Method(java.lang.reflect.Method) SentencesAnnotation(edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation) TreeAnnotation(edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation) TextAnnotation(edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation) EntityMentionsAnnotation(edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations.EntityMentionsAnnotation) TokensAnnotation(edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation) Annotation(edu.stanford.nlp.pipeline.Annotation) IOException(java.io.IOException) ArrayList(java.util.ArrayList) List(java.util.List) File(java.io.File) CoreMap(edu.stanford.nlp.util.CoreMap)
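
Both branches of the event section reach MSTBasedEventExtractor through reflection (a static load(String) factory via Method.invoke, or the (boolean) constructor via Constructor.newInstance), so CoreNLP compiles even when that class is absent. A minimal sketch of the same load-or-construct pattern (the helper class and its parameters are assumptions; only the reflection calls mirror the code above):

import java.lang.reflect.Constructor;
import java.lang.reflect.Method;

public class ReflectiveExtractorLoader {

    // Loads a serialized model via a static load(String) factory when a path
    // is given; otherwise instantiates the class with its (boolean) constructor.
    public static Object loadOrCreate(String className, String modelPath,
                                      boolean usePredictedEntities) throws Exception {
        Class<?> clazz = Class.forName(className);
        if (modelPath != null) {
            // Static method, so the receiver passed to invoke() is null.
            Method load = clazz.getMethod("load", String.class);
            return load.invoke(null, modelPath);
        }
        Constructor<?> ctor = clazz.getConstructor(boolean.class);
        return ctor.newInstance(usePredictedEntities);
    }
}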

Example 58 with Annotation

Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

From the class RothCONLL04Reader, method read.

@Override
public Annotation read(String path) throws IOException {
    Annotation doc = new Annotation("");
    logger.info("Reading file: " + path);
    // Each iteration through this loop processes a single sentence along with any relations in it
    for (Iterator<String> lineIterator = IOUtils.readLines(path).iterator(); lineIterator.hasNext(); ) {
        Annotation sentence = readSentence(path, lineIterator);
        AnnotationUtils.addSentence(doc, sentence);
    }
    return doc;
}
Also used: Annotation(edu.stanford.nlp.pipeline.Annotation)
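
A minimal usage sketch for the reader (the corpus path is a placeholder, and the no-argument constructor and import path are assumptions based on the CoreNLP source layout):

import edu.stanford.nlp.ie.machinereading.domains.roth.RothCONLL04Reader;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.CoreMap;

public class RothReaderDemo {
    public static void main(String[] args) throws Exception {
        RothCONLL04Reader reader = new RothCONLL04Reader();
        Annotation doc = reader.read("/path/to/conll04.corp");  // placeholder path
        // Each sentence added by AnnotationUtils.addSentence is available here.
        for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
            System.out.println(sentence.get(CoreAnnotations.TextAnnotation.class));
        }
    }
}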

Example 59 with Annotation

Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

From the class RothCONLL04Reader, method readSentence.

private Annotation readSentence(String docId, Iterator<String> lineIterator) {
    Annotation sentence = new Annotation("");
    sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
    sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, new ArrayList<>());
    // we'll need to set things like the tokens and textContent after we've
    // fully read the sentence
    // contains the full text that we've read so far
    StringBuilder textContent = new StringBuilder();
    // how many tokens we've seen so far
    int tokenCount = 0;
    List<CoreLabel> tokens = new ArrayList<>();
    // when we've seen two blank lines in a row, this sentence is over (one
    // blank line separates the sentence and the relations)
    int numBlankLinesSeen = 0;
    String sentenceID = null;
    // keeps tracks of entities we've seen so far for use by relations
    Map<String, EntityMention> indexToEntityMention = new HashMap<>();
    while (lineIterator.hasNext() && numBlankLinesSeen < 2) {
        String currentLine = lineIterator.next();
        currentLine = currentLine.replace("COMMA", ",");
        List<String> pieces = StringUtils.split(currentLine);
        String identifier;
        int size = pieces.size();
        switch(size) {
            case 1: // blank line between sentences or relations
                numBlankLinesSeen++;
                break;
            case 3: // relation
                String type = pieces.get(2);
                List<ExtractionObject> args = new ArrayList<>();
                EntityMention entity1 = indexToEntityMention.get(pieces.get(0));
                EntityMention entity2 = indexToEntityMention.get(pieces.get(1));
                args.add(entity1);
                args.add(entity2);
                Span span = new Span(entity1.getExtentTokenStart(), entity2.getExtentTokenEnd());
                // identifier = "relation" + sentenceID + "-" + sentence.getAllRelations().size();
                identifier = RelationMention.makeUniqueId();
                RelationMention relationMention = new RelationMention(identifier, sentence, span, type, null, args);
                AnnotationUtils.addRelationMention(sentence, relationMention);
                break;
            case 9: // token
                /*
                 * Roth token lines look like this:
                 *
                 * 19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
                 */
                // Entities may be multiple words joined by '/'; we split these up
                List<String> words = StringUtils.split(pieces.get(5), "/");
                //List<String> postags = StringUtils.split(pieces.get(4),"/");
                String text = StringUtils.join(words, " ");
                identifier = "entity" + pieces.get(0) + '-' + pieces.get(2);
                // entity type of the word/expression
                String nerTag = getNormalizedNERTag(pieces.get(1));
                if (sentenceID == null)
                    sentenceID = pieces.get(0);
                if (!nerTag.equals("O")) {
                    Span extentSpan = new Span(tokenCount, tokenCount + words.size());
                    // Temporarily sets the head span to equal the extent span.
                    // This is so the entity has a head (in particular, getValue() works) even if preprocessSentences isn't called.
                    // The head span is later modified if preprocessSentences is called.
                    EntityMention entity = new EntityMention(identifier, sentence, extentSpan, extentSpan, nerTag, null, null);
                    AnnotationUtils.addEntityMention(sentence, entity);
                    // we can get by using these indices as strings since we only use them
                    // as a hash key
                    String index = pieces.get(2);
                    indexToEntityMention.put(index, entity);
                }
                // int i =0;
                for (String word : words) {
                    CoreLabel label = new CoreLabel();
                    label.setWord(word);
                    //label.setTag(postags.get(i));
                    label.set(CoreAnnotations.TextAnnotation.class, word);
                    label.set(CoreAnnotations.ValueAnnotation.class, word);
                    // we don't set TokenBeginAnnotation or TokenEndAnnotation since we're
                    // not keeping track of character offsets
                    tokens.add(label);
                // i++;
                }
                textContent.append(text);
                textContent.append(' ');
                tokenCount += words.size();
                break;
        }
    }
    sentence.set(CoreAnnotations.TextAnnotation.class, textContent.toString());
    sentence.set(CoreAnnotations.ValueAnnotation.class, textContent.toString());
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    sentence.set(CoreAnnotations.SentenceIDAnnotation.class, sentenceID);
    return sentence;
}
Also used: RelationMention(edu.stanford.nlp.ie.machinereading.structure.RelationMention) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Span(edu.stanford.nlp.ie.machinereading.structure.Span) Annotation(edu.stanford.nlp.pipeline.Annotation) MachineReadingAnnotations(edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations) CoreLabel(edu.stanford.nlp.ling.CoreLabel) EntityMention(edu.stanford.nlp.ie.machinereading.structure.EntityMention) ExtractionObject(edu.stanford.nlp.ie.machinereading.structure.ExtractionObject) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations)
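
The switch above keys on the number of whitespace-separated fields per line. A hedged illustration of the three line shapes the reader expects (the 9-field token line is the one quoted in the source comment; the relation line values are invented to match the format):

// 9 fields: a token line; field 5 may join several words with '/'
19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
// 3 fields: a relation line: arg1-token-index arg2-token-index relation-type
3 9 Live_In
// 1 field: a blank separator line; two in a row end the sentence block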

Example 60 with Annotation

Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

From the class AceReader, method readDocument.

/**
   * Reads in a single ACE*.apf.xml file and converts it to RelationSentence
   * objects. However, you probably should call parse() instead.
   *
   * @param prefix prefix of the ACE filename to read, without the ".apf.xml"
   *          extension (e.g.
   *          "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01")
   * @return list of RelationSentence objects
   */
private List<CoreMap> readDocument(String prefix, Annotation corpus) throws IOException, SAXException, ParserConfigurationException {
    logger.info("Reading document: " + prefix);
    List<CoreMap> results = new ArrayList<>();
    AceDocument aceDocument;
    if (aceVersion.equals("ACE2004")) {
        aceDocument = AceDocument.parseDocument(prefix, false, aceVersion);
    } else {
        aceDocument = AceDocument.parseDocument(prefix, false);
    }
    String docId = aceDocument.getId();
    // map entity mention ID strings to their EntityMention counterparts
    Map<String, EntityMention> entityMentionMap = Generics.newHashMap();
    /*
    for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
      List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
      StringBuffer b = new StringBuffer();
      for(AceToken t: tokens) b.append(t.getLiteral() + " " );
      logger.info("SENTENCE: " + b.toString());
    }
    */
    int tokenOffset = 0;
    for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
        List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
        List<CoreLabel> words = new ArrayList<>();
        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < tokens.size(); i++) {
            CoreLabel l = new CoreLabel();
            l.setWord(tokens.get(i).getLiteral());
            l.set(CoreAnnotations.ValueAnnotation.class, l.word());
            l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokens.get(i).getByteStart());
            l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokens.get(i).getByteEnd());
            words.add(l);
            if (i > 0)
                textContent.append(" ");
            textContent.append(tokens.get(i).getLiteral());
        }
        // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
        if (words.size() == 1) {
            String word = words.get(0).word();
            if (word.startsWith("<") && word.endsWith(">")) {
                tokenOffset += tokens.size();
                continue;
            }
        }
        CoreMap sentence = new Annotation(textContent.toString());
        sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
        sentence.set(CoreAnnotations.TokensAnnotation.class, words);
        logger.info("Reading sentence: \"" + textContent + "\"");
        List<AceEntityMention> entityMentions = aceDocument.getEntityMentions(sentenceIndex);
        List<AceRelationMention> relationMentions = aceDocument.getRelationMentions(sentenceIndex);
        List<AceEventMention> eventMentions = aceDocument.getEventMentions(sentenceIndex);
        // convert entity mentions
        for (AceEntityMention aceEntityMention : entityMentions) {
            String corefID = "";
            for (String entityID : aceDocument.getKeySetEntities()) {
                AceEntity e = aceDocument.getEntity(entityID);
                if (e.getMentions().contains(aceEntityMention)) {
                    corefID = entityID;
                    break;
                }
            }
            EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
            //        EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset);
            entityCounts.incrementCount(convertedMention.getType());
            logger.info("CONVERTED MENTION HEAD SPAN: " + convertedMention.getHead());
            logger.info("CONVERTED ENTITY MENTION: " + convertedMention);
            AnnotationUtils.addEntityMention(sentence, convertedMention);
            entityMentionMap.put(aceEntityMention.getId(), convertedMention);
        // TODO: make Entity objects as needed
        }
        // convert relation mentions
        for (AceRelationMention aceRelationMention : relationMentions) {
            RelationMention convertedMention = convertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap);
            if (convertedMention != null) {
                relationCounts.incrementCount(convertedMention.getType());
                logger.info("CONVERTED RELATION MENTION: " + convertedMention);
                AnnotationUtils.addRelationMention(sentence, convertedMention);
            }
        // TODO: make Relation objects
        }
        // convert EventMentions
        for (AceEventMention aceEventMention : eventMentions) {
            EventMention convertedMention = convertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset);
            if (convertedMention != null) {
                eventCounts.incrementCount(convertedMention.getType());
                logger.info("CONVERTED EVENT MENTION: " + convertedMention);
                AnnotationUtils.addEventMention(sentence, convertedMention);
            }
        // TODO: make Event objects
        }
        results.add(sentence);
        tokenOffset += tokens.size();
    }
    return results;
}
Also used: EventMention(edu.stanford.nlp.ie.machinereading.structure.EventMention) AceEventMention(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEventMention) AceRelationMention(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceRelationMention) RelationMention(edu.stanford.nlp.ie.machinereading.structure.RelationMention) ArrayList(java.util.ArrayList) AceEntity(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntity) Annotation(edu.stanford.nlp.pipeline.Annotation) AceDocument(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceDocument) CoreLabel(edu.stanford.nlp.ling.CoreLabel) EntityMention(edu.stanford.nlp.ie.machinereading.structure.EntityMention) AceEntityMention(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention) AceToken(edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceToken) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap)
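
readDocument is private; as its Javadoc notes, parse() is the public entry point inherited from the reader's base class. A minimal usage sketch (the constructor arguments and corpus path are assumptions; check the AceReader constructors in your CoreNLP version):

import edu.stanford.nlp.ie.machinereading.domains.ace.AceReader;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;

public class AceReaderDemo {
    public static void main(String[] args) throws Exception {
        // Assumed (processor, preprocess) constructor: null pipeline, no
        // preprocessing, i.e. just read the *.apf.xml annotations.
        AceReader reader = new AceReader(null, false);
        Annotation corpus = reader.parse("/path/to/ACE2005/english_test");  // placeholder path
        int n = corpus.get(CoreAnnotations.SentencesAnnotation.class).size();
        System.out.println("Read " + n + " sentences");
    }
}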

Aggregations

Annotation (edu.stanford.nlp.pipeline.Annotation): 91
CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 58
CoreMap (edu.stanford.nlp.util.CoreMap): 50
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 30
StanfordCoreNLP (edu.stanford.nlp.pipeline.StanfordCoreNLP): 27
ArrayList (java.util.ArrayList): 25
Properties (java.util.Properties): 25
SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations): 19
SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph): 14
TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations): 13
SentencesAnnotation (edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation): 12
TreeAnnotation (edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation): 12
List (java.util.List): 11
Tree (edu.stanford.nlp.trees.Tree): 10
TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation): 8
IOException (java.io.IOException): 8
TokensAnnotation (edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation): 7
CorefChain (edu.stanford.nlp.coref.data.CorefChain): 6
EntityMentionsAnnotation (edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations.EntityMentionsAnnotation): 6
CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation): 6