Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
The class MachineReading, method keepPercentage.
/** Keeps only the first {@code percentage} fraction of sentences from the given corpus */
private static Annotation keepPercentage(Annotation corpus, double percentage) {
log.info("Using fraction of train: " + percentage);
if (percentage >= 1.0) {
return corpus;
}
Annotation smaller = new Annotation("");
List<CoreMap> sents = new ArrayList<>();
List<CoreMap> fullSents = corpus.get(SentencesAnnotation.class);
double smallSize = (double) fullSents.size() * percentage;
for (int i = 0; i < smallSize; i++) {
sents.add(fullSents.get(i));
}
log.info("TRAIN corpus size reduced from " + fullSents.size() + " to " + sents.size());
smaller.set(SentencesAnnotation.class, sents);
return smaller;
}
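A minimal usage sketch of the same trimming logic on a hand-built toy corpus (the class name and sentence texts below are illustrative only; keepPercentage itself is private to MachineReading):

import java.util.ArrayList;
import java.util.List;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.CoreMap;

public class KeepPercentageSketch {
  public static void main(String[] args) {
    // build a toy corpus of ten "sentences"; each sentence is itself an Annotation
    Annotation corpus = new Annotation("");
    List<CoreMap> sentences = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
      sentences.add(new Annotation("sentence " + i));
    }
    corpus.set(SentencesAnnotation.class, sentences);
    // keep the first 50% of sentences, mirroring keepPercentage(corpus, 0.5)
    List<CoreMap> full = corpus.get(SentencesAnnotation.class);
    List<CoreMap> kept = new ArrayList<>(full.subList(0, (int) (full.size() * 0.5)));
    Annotation smaller = new Annotation("");
    smaller.set(SentencesAnnotation.class, kept);
    System.out.println("Reduced from " + full.size() + " to " + kept.size() + " sentences");
  }
}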
Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
The class MachineReading, method train.
protected void train(Annotation training, int partition) throws Exception {
// Entity extraction
if (MachineReadingProperties.extractEntities) {
MachineReadingProperties.logger.info("Training entity extraction model(s)");
if (partition != -1)
MachineReadingProperties.logger.info("In partition #" + partition);
String modelName = MachineReadingProperties.serializedEntityExtractorPath;
if (partition != -1)
modelName += "." + partition;
File modelFile = new File(modelName);
MachineReadingProperties.logger.fine("forceRetraining = " + this.forceRetraining + ", modelFile.exists = " + modelFile.exists());
if (!this.forceRetraining && modelFile.exists()) {
MachineReadingProperties.logger.info("Loading entity extraction model from " + modelName + " ...");
entityExtractor = BasicEntityExtractor.load(modelName, MachineReadingProperties.entityClassifier, false);
} else {
MachineReadingProperties.logger.info("Training entity extraction model...");
entityExtractor = makeEntityExtractor(MachineReadingProperties.entityClassifier, MachineReadingProperties.entityGazetteerPath);
entityExtractor.train(training);
MachineReadingProperties.logger.info("Serializing entity extraction model to " + modelName + " ...");
entityExtractor.save(modelName);
}
}
// Relation extraction
if (MachineReadingProperties.extractRelations) {
MachineReadingProperties.logger.info("Training relation extraction model(s)");
if (partition != -1)
MachineReadingProperties.logger.info("In partition #" + partition);
String modelName = MachineReadingProperties.serializedRelationExtractorPath;
if (partition != -1)
modelName += "." + partition;
if (MachineReadingProperties.useRelationExtractionModelMerging) {
String[] modelNames = MachineReadingProperties.serializedRelationExtractorPath.split(",");
if (partition != -1) {
for (int i = 0; i < modelNames.length; i++) {
modelNames[i] += "." + partition;
}
}
relationExtractor = ExtractorMerger.buildRelationExtractorMerger(modelNames);
} else if (!this.forceRetraining && new File(modelName).exists()) {
MachineReadingProperties.logger.info("Loading relation extraction model from " + modelName + " ...");
//TODO change this to load any type of BasicRelationExtractor
relationExtractor = BasicRelationExtractor.load(modelName);
} else {
RelationFeatureFactory rff = makeRelationFeatureFactory(MachineReadingProperties.relationFeatureFactoryClass, MachineReadingProperties.relationFeatures, MachineReadingProperties.doNotLexicalizeFirstArg);
ArgumentParser.fillOptions(rff, args);
Annotation predicted = null;
if (MachineReadingProperties.trainRelationsUsingPredictedEntities) {
// generate predicted entities
assert (entityExtractor != null);
predicted = AnnotationUtils.deepMentionCopy(training);
entityExtractor.annotate(predicted);
for (ResultsPrinter rp : entityResultsPrinterSet) {
String msg = rp.printResults(training, predicted);
MachineReadingProperties.logger.info("Training relation extraction using predicted entitities: entity scores using printer " + rp.getClass() + ":\n" + msg);
}
// change relation mentions to use predicted entity mentions rather than gold ones
try {
changeGoldRelationArgsToPredicted(predicted);
} catch (Exception e) {
// we may get here for unknown EntityMentionComparator class
throw new RuntimeException(e);
}
}
Annotation dataset;
if (MachineReadingProperties.trainRelationsUsingPredictedEntities) {
dataset = predicted;
} else {
dataset = training;
}
Set<String> relationsToSkip = new HashSet<>(StringUtils.split(MachineReadingProperties.relationsToSkipDuringTraining, ","));
List<List<RelationMention>> backedUpRelations = new ArrayList<>();
if (relationsToSkip.size() > 0) {
// we need to backup the relations since removeSkippableRelations modifies dataset in place and we can't duplicate CoreMaps safely (or can we?)
for (CoreMap sent : dataset.get(CoreAnnotations.SentencesAnnotation.class)) {
List<RelationMention> relationMentions = sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
backedUpRelations.add(relationMentions);
}
removeSkippableRelations(dataset, relationsToSkip);
}
//relationExtractor = new BasicRelationExtractor(rff, MachineReadingProperties.createUnrelatedRelations, makeRelationMentionFactory(MachineReadingProperties.relationMentionFactoryClass));
relationExtractor = makeRelationExtractor(MachineReadingProperties.relationClassifier, rff, MachineReadingProperties.createUnrelatedRelations, makeRelationMentionFactory(MachineReadingProperties.relationMentionFactoryClass));
ArgumentParser.fillOptions(relationExtractor, args);
//Arguments.parse(args,relationExtractor);
MachineReadingProperties.logger.info("Training relation extraction model...");
relationExtractor.train(dataset);
MachineReadingProperties.logger.info("Serializing relation extraction model to " + modelName + " ...");
relationExtractor.save(modelName);
if (relationsToSkip.size() > 0) {
// restore backed up relations into dataset
int sentenceIndex = 0;
for (CoreMap sentence : dataset.get(CoreAnnotations.SentencesAnnotation.class)) {
List<RelationMention> relationMentions = backedUpRelations.get(sentenceIndex);
sentence.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, relationMentions);
sentenceIndex++;
}
}
}
}
// Event extraction
if (MachineReadingProperties.extractEvents) {
MachineReadingProperties.logger.info("Training event extraction model(s)");
if (partition != -1)
MachineReadingProperties.logger.info("In partition #" + partition);
String modelName = MachineReadingProperties.serializedEventExtractorPath;
if (partition != -1)
modelName += "." + partition;
File modelFile = new File(modelName);
if (!this.forceRetraining && modelFile.exists()) {
MachineReadingProperties.logger.info("Loading event extraction model from " + modelName + " ...");
// use the fully qualified class name so Class.forName can resolve it (matches the constructor lookup below)
Method mstLoader = (Class.forName("edu.stanford.nlp.ie.machinereading.MSTBasedEventExtractor")).getMethod("load", String.class);
eventExtractor = (Extractor) mstLoader.invoke(null, modelName);
} else {
Annotation predicted = null;
if (MachineReadingProperties.trainEventsUsingPredictedEntities) {
// generate predicted entities
assert (entityExtractor != null);
predicted = AnnotationUtils.deepMentionCopy(training);
entityExtractor.annotate(predicted);
for (ResultsPrinter rp : entityResultsPrinterSet) {
String msg = rp.printResults(training, predicted);
MachineReadingProperties.logger.info("Training event extraction using predicted entitities: entity scores using printer " + rp.getClass() + ":\n" + msg);
}
// TODO: need an equivalent of changeGoldRelationArgsToPredicted here?
}
Constructor<?> mstConstructor = (Class.forName("edu.stanford.nlp.ie.machinereading.MSTBasedEventExtractor")).getConstructor(boolean.class);
eventExtractor = (Extractor) mstConstructor.newInstance(MachineReadingProperties.trainEventsUsingPredictedEntities);
MachineReadingProperties.logger.info("Training event extraction model...");
// guard on the events flag: predicted is only built when trainEventsUsingPredictedEntities is true
if (MachineReadingProperties.trainEventsUsingPredictedEntities) {
eventExtractor.train(predicted);
} else {
eventExtractor.train(training);
}
MachineReadingProperties.logger.info("Serializing event extraction model to " + modelName + " ...");
eventExtractor.save(modelName);
}
}
}
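The entity, relation, and event branches above all follow the same load-or-retrain pattern: reuse a serialized model if one exists and retraining is not forced, otherwise train and serialize a new one. A hedged sketch of that pattern in isolation (TrainableModel, loadModel, and newModel are illustrative placeholders, not CoreNLP API):

import java.io.File;

public class LoadOrRetrainSketch {
  // stand-in for the entity/relation/event extractor interface; illustrative only
  interface TrainableModel {
    void train(Object dataset);
    void save(String path) throws Exception;
  }

  static TrainableModel loadOrTrain(String modelName, boolean forceRetraining, Object trainingData) throws Exception {
    File modelFile = new File(modelName);
    if (!forceRetraining && modelFile.exists()) {
      // reuse the previously serialized model
      return loadModel(modelName);
    }
    // otherwise train from scratch and serialize for future runs
    TrainableModel model = newModel();
    model.train(trainingData);
    model.save(modelName);
    return model;
  }

  // placeholders for deserialization and construction; not real CoreNLP methods
  static TrainableModel loadModel(String path) { throw new UnsupportedOperationException("placeholder"); }
  static TrainableModel newModel() { throw new UnsupportedOperationException("placeholder"); }
}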
Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
The class RothCONLL04Reader, method read.
@Override
public Annotation read(String path) throws IOException {
Annotation doc = new Annotation("");
logger.info("Reading file: " + path);
// Each iteration through this loop processes a single sentence along with any relations in it
for (Iterator<String> lineIterator = IOUtils.readLines(path).iterator(); lineIterator.hasNext(); ) {
Annotation sentence = readSentence(path, lineIterator);
AnnotationUtils.addSentence(doc, sentence);
}
return doc;
}
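A hedged usage sketch of read, assuming the reader's no-argument constructor and an illustrative corpus path:

import edu.stanford.nlp.ie.machinereading.domains.roth.RothCONLL04Reader;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.CoreMap;

public class RothReaderSketch {
  public static void main(String[] args) throws Exception {
    RothCONLL04Reader reader = new RothCONLL04Reader();
    // the path below is illustrative, not a real file
    Annotation corpus = reader.read("/path/to/conll04.corp");
    // each sentence Annotation carries the text set by readSentence
    for (CoreMap sentence : corpus.get(CoreAnnotations.SentencesAnnotation.class)) {
      System.out.println(sentence.get(CoreAnnotations.TextAnnotation.class));
    }
  }
}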
Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
The class RothCONLL04Reader, method readSentence.
private Annotation readSentence(String docId, Iterator<String> lineIterator) {
Annotation sentence = new Annotation("");
sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, new ArrayList<>());
// we'll need to set things like the tokens and textContent after we've
// fully read the sentence
// contains the full text that we've read so far
StringBuilder textContent = new StringBuilder();
// how many tokens we've seen so far
int tokenCount = 0;
List<CoreLabel> tokens = new ArrayList<>();
// when we've seen two blank lines in a row, this sentence is over (one
// blank line separates the sentence from the relations)
int numBlankLinesSeen = 0;
String sentenceID = null;
// keeps track of entities we've seen so far for use by relations
Map<String, EntityMention> indexToEntityMention = new HashMap<>();
while (lineIterator.hasNext() && numBlankLinesSeen < 2) {
String currentLine = lineIterator.next();
currentLine = currentLine.replace("COMMA", ",");
List<String> pieces = StringUtils.split(currentLine);
String identifier;
int size = pieces.size();
switch(size) {
case 1: // blank line between sentences or relations
numBlankLinesSeen++;
break;
case 3: // relation
String type = pieces.get(2);
List<ExtractionObject> args = new ArrayList<>();
EntityMention entity1 = indexToEntityMention.get(pieces.get(0));
EntityMention entity2 = indexToEntityMention.get(pieces.get(1));
args.add(entity1);
args.add(entity2);
Span span = new Span(entity1.getExtentTokenStart(), entity2.getExtentTokenEnd());
// identifier = "relation" + sentenceID + "-" + sentence.getAllRelations().size();
identifier = RelationMention.makeUniqueId();
RelationMention relationMention = new RelationMention(identifier, sentence, span, type, null, args);
AnnotationUtils.addRelationMention(sentence, relationMention);
break;
case 9: // token
/*
* Roth token lines look like this:
*
* 19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
*/
// Entities may be multiple words joined by '/'; we split these up
List<String> words = StringUtils.split(pieces.get(5), "/");
//List<String> postags = StringUtils.split(pieces.get(4),"/");
String text = StringUtils.join(words, " ");
identifier = "entity" + pieces.get(0) + '-' + pieces.get(2);
// entity type of the word/expression
String nerTag = getNormalizedNERTag(pieces.get(1));
if (sentenceID == null)
sentenceID = pieces.get(0);
if (!nerTag.equals("O")) {
Span extentSpan = new Span(tokenCount, tokenCount + words.size());
// Temporarily sets the head span to equal the extent span.
// This is so the entity has a head (in particular, getValue() works) even if preprocessSentences isn't called.
// The head span is later modified if preprocessSentences is called.
EntityMention entity = new EntityMention(identifier, sentence, extentSpan, extentSpan, nerTag, null, null);
AnnotationUtils.addEntityMention(sentence, entity);
// we can get by using these indices as strings since we only use them
// as a hash key
String index = pieces.get(2);
indexToEntityMention.put(index, entity);
}
// int i =0;
for (String word : words) {
CoreLabel label = new CoreLabel();
label.setWord(word);
//label.setTag(postags.get(i));
label.set(CoreAnnotations.TextAnnotation.class, word);
label.set(CoreAnnotations.ValueAnnotation.class, word);
// we don't set TokenBeginAnnotation or TokenEndAnnotation since we're
// not keeping track of character offsets
tokens.add(label);
// i++;
}
textContent.append(text);
textContent.append(' ');
tokenCount += words.size();
break;
}
}
sentence.set(CoreAnnotations.TextAnnotation.class, textContent.toString());
sentence.set(CoreAnnotations.ValueAnnotation.class, textContent.toString());
sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
sentence.set(CoreAnnotations.SentenceIDAnnotation.class, sentenceID);
return sentence;
}
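The method dispatches on the number of whitespace-separated fields per line: 1 field marks a blank line, 3 a relation, and 9 a token. A minimal sketch of that dispatch, using the example token line from the comment above as its only data:

import java.util.List;
import edu.stanford.nlp.util.StringUtils;

public class RothLineSketch {
  public static void main(String[] args) {
    String tokenLine = "19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O"; // 9 fields, so a token line
    List<String> pieces = StringUtils.split(tokenLine);
    switch (pieces.size()) {
      case 1: // blank line between sentences or relations
        break;
      case 3: // relation line: arg1 token index, arg2 token index, relation type
        break;
      case 9: // token line
        String sentenceId = pieces.get(0); // "19"
        String nerTag = pieces.get(1); // "Peop"
        String tokenIndex = pieces.get(2); // "9"
        List<String> words = StringUtils.split(pieces.get(5), "/"); // ["Jamal", "Ghosheh"]
        System.out.println(sentenceId + " " + nerTag + " " + tokenIndex + " " + words);
        break;
    }
  }
}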
Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
The class AceReader, method readDocument.
/**
* Reads in a single ACE*.apf.xml file and converts it to RelationSentence
* objects. However, you probably should call parse() instead.
*
* @param prefix prefix of the ACE filename to read (e.g.
* "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01"),
* without the ".apf.xml" extension
* @return list of RelationSentence objects
*/
private List<CoreMap> readDocument(String prefix, Annotation corpus) throws IOException, SAXException, ParserConfigurationException {
logger.info("Reading document: " + prefix);
List<CoreMap> results = new ArrayList<>();
AceDocument aceDocument;
if (aceVersion.equals("ACE2004")) {
aceDocument = AceDocument.parseDocument(prefix, false, aceVersion);
} else {
aceDocument = AceDocument.parseDocument(prefix, false);
}
String docId = aceDocument.getId();
// map entity mention ID strings to their EntityMention counterparts
Map<String, EntityMention> entityMentionMap = Generics.newHashMap();
/*
for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
StringBuffer b = new StringBuffer();
for(AceToken t: tokens) b.append(t.getLiteral() + " " );
logger.info("SENTENCE: " + b.toString());
}
*/
int tokenOffset = 0;
for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
List<CoreLabel> words = new ArrayList<>();
StringBuilder textContent = new StringBuilder();
for (int i = 0; i < tokens.size(); i++) {
CoreLabel l = new CoreLabel();
l.setWord(tokens.get(i).getLiteral());
l.set(CoreAnnotations.ValueAnnotation.class, l.word());
l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokens.get(i).getByteStart());
l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokens.get(i).getByteEnd());
words.add(l);
if (i > 0)
textContent.append(" ");
textContent.append(tokens.get(i).getLiteral());
}
// skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
if (words.size() == 1) {
String word = words.get(0).word();
if (word.startsWith("<") && word.endsWith(">")) {
tokenOffset += tokens.size();
continue;
}
}
CoreMap sentence = new Annotation(textContent.toString());
sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
sentence.set(CoreAnnotations.TokensAnnotation.class, words);
logger.info("Reading sentence: \"" + textContent + "\"");
List<AceEntityMention> entityMentions = aceDocument.getEntityMentions(sentenceIndex);
List<AceRelationMention> relationMentions = aceDocument.getRelationMentions(sentenceIndex);
List<AceEventMention> eventMentions = aceDocument.getEventMentions(sentenceIndex);
// convert entity mentions
for (AceEntityMention aceEntityMention : entityMentions) {
String corefID = "";
for (String entityID : aceDocument.getKeySetEntities()) {
AceEntity e = aceDocument.getEntity(entityID);
if (e.getMentions().contains(aceEntityMention)) {
corefID = entityID;
break;
}
}
EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
// EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset);
entityCounts.incrementCount(convertedMention.getType());
logger.info("CONVERTED MENTION HEAD SPAN: " + convertedMention.getHead());
logger.info("CONVERTED ENTITY MENTION: " + convertedMention);
AnnotationUtils.addEntityMention(sentence, convertedMention);
entityMentionMap.put(aceEntityMention.getId(), convertedMention);
// TODO: make Entity objects as needed
}
// convert relation mentions
for (AceRelationMention aceRelationMention : relationMentions) {
RelationMention convertedMention = convertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap);
if (convertedMention != null) {
relationCounts.incrementCount(convertedMention.getType());
logger.info("CONVERTED RELATION MENTION: " + convertedMention);
AnnotationUtils.addRelationMention(sentence, convertedMention);
}
// TODO: make Relation objects
}
// convert EventMentions
for (AceEventMention aceEventMention : eventMentions) {
EventMention convertedMention = convertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset);
if (convertedMention != null) {
eventCounts.incrementCount(convertedMention.getType());
logger.info("CONVERTED EVENT MENTION: " + convertedMention);
AnnotationUtils.addEventMention(sentence, convertedMention);
}
// TODO: make Event objects
}
results.add(sentence);
tokenOffset += tokens.size();
}
return results;
}
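A hedged sketch mirroring the token-conversion loop above, with made-up words and character offsets standing in for AceToken data:

import java.util.ArrayList;
import java.util.List;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;

public class AceTokenSketch {
  public static void main(String[] args) {
    String[] literals = { "Hello", "world" }; // illustrative token text
    int[][] offsets = { { 0, 5 }, { 6, 11 } }; // illustrative character offsets
    List<CoreLabel> words = new ArrayList<>();
    StringBuilder textContent = new StringBuilder();
    for (int i = 0; i < literals.length; i++) {
      // build one CoreLabel per token, recording its text and character offsets
      CoreLabel l = new CoreLabel();
      l.setWord(literals[i]);
      l.set(CoreAnnotations.ValueAnnotation.class, l.word());
      l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offsets[i][0]);
      l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offsets[i][1]);
      words.add(l);
      if (i > 0) textContent.append(' ');
      textContent.append(literals[i]);
    }
    System.out.println(textContent + " -> " + words.size() + " tokens");
  }
}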