use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
the class MachineReading method annotate.
protected Annotation annotate(Annotation testing, int partition) {
  int partitionIndex = (partition != -1 ? partition : 0);

  //
  if (MachineReadingProperties.extractEntities) {
    assert (entityExtractor != null);
    Annotation predicted = AnnotationUtils.deepMentionCopy(testing);
    entityExtractor.annotate(predicted);
    for (ResultsPrinter rp : entityResultsPrinterSet) {
      String msg = rp.printResults(testing, predicted);
      MachineReadingProperties.logger.info("Entity extraction results " + (partition != -1 ? "for partition #" + partition : "") + " using printer " + rp.getClass() + ":\n" + msg);
    }
    predictions[ENTITY_LEVEL][partitionIndex] = predicted;
  }

  //
  if (MachineReadingProperties.extractRelations) {
    assert (relationExtractor != null);
    Annotation predicted = (MachineReadingProperties.testRelationsUsingPredictedEntities ? predictions[ENTITY_LEVEL][partitionIndex] : AnnotationUtils.deepMentionCopy(testing));
    // make sure the entities have the syntactic head and span set; we need this for relation extraction features
    // TODO(AngledLuffa): this call to assignSyntacticHeadToEntities
    // is changing the annotations for the original annotation.
    // This is probably not right? It can result in changes in the
    // dependencies when run in the pipeline. For example:
    // "They are such as interested Thomas Aquinas and Bonaventura, Anselm and Bernard."
    // https://github.com/stanfordnlp/CoreNLP/issues/1053
    assignSyntacticHeadToEntities(predicted);
    relationExtractor.annotate(predicted);
    if (relationExtractionPostProcessor == null) {
      relationExtractionPostProcessor = makeExtractor(MachineReadingProperties.relationExtractionPostProcessorClass);
    }
    if (relationExtractionPostProcessor != null) {
      MachineReadingProperties.logger.info("Using relation extraction post processor: " + MachineReadingProperties.relationExtractionPostProcessorClass);
      relationExtractionPostProcessor.annotate(predicted);
    }
    for (ResultsPrinter rp : getRelationResultsPrinterSet()) {
      String msg = rp.printResults(testing, predicted);
      MachineReadingProperties.logger.info("Relation extraction results " + (partition != -1 ? "for partition #" + partition : "") + " using printer " + rp.getClass() + ":\n" + msg);
    }

    //
    if (consistencyChecker == null) {
      consistencyChecker = makeExtractor(MachineReadingProperties.consistencyCheck);
    }
    if (consistencyChecker != null) {
      MachineReadingProperties.logger.info("Using consistency checker: " + MachineReadingProperties.consistencyCheck);
      consistencyChecker.annotate(predicted);
      for (ResultsPrinter rp : entityResultsPrinterSet) {
        String msg = rp.printResults(testing, predicted);
        MachineReadingProperties.logger.info("Entity extraction results AFTER consistency checks " + (partition != -1 ? "for partition #" + partition : "") + " using printer " + rp.getClass() + ":\n" + msg);
      }
      for (ResultsPrinter rp : getRelationResultsPrinterSet()) {
        String msg = rp.printResults(testing, predicted);
        MachineReadingProperties.logger.info("Relation extraction results AFTER consistency checks " + (partition != -1 ? "for partition #" + partition : "") + " using printer " + rp.getClass() + ":\n" + msg);
      }
    }
    predictions[RELATION_LEVEL][partitionIndex] = predicted;
  }

  return predictions[RELATION_LEVEL][partitionIndex];
}
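The method follows a copy-annotate-score pattern: testing carries the gold labels, deepMentionCopy produces a copy for the extractor to overwrite, and each ResultsPrinter scores the predicted copy against the gold original. A minimal sketch of that pattern, where gold, extractor, and printer are hypothetical stand-ins (only AnnotationUtils.deepMentionCopy and printResults come from the code above):

// Sketch only: 'gold', 'extractor', and 'printer' are hypothetical stand-ins.
Annotation gold = loadGoldCorpus();                            // hypothetical loader
Annotation predicted = AnnotationUtils.deepMentionCopy(gold);  // copy, so the gold labels survive
extractor.annotate(predicted);                                 // extractor overwrites the copy's labels
String report = printer.printResults(gold, predicted);         // score predicted against gold
System.out.println(report);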
use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
the class MachineReading method run.
/**
 * Performs extraction. This will train a new extraction model and evaluate
 * the model on the test set. Depending on the MachineReading instance's
 * parameters, it may skip training if a model already exists, or skip
 * evaluation.
 *
 * @return the results strings, which can be compared in a unit test
 */
public List<String> run() throws Exception {
  this.forceRetraining = !MachineReadingProperties.loadModel;
  if (MachineReadingProperties.trainOnly) {
    this.forceRetraining = true;
  }
  List<String> retMsg = new ArrayList<>();
  boolean haveSerializedEntityExtractor = serializedModelExists(MachineReadingProperties.serializedEntityExtractorPath);
  boolean haveSerializedRelationExtractor = serializedModelExists(MachineReadingProperties.serializedRelationExtractorPath);
  boolean haveSerializedEventExtractor = serializedModelExists(MachineReadingProperties.serializedEventExtractorPath);

  Annotation training = null;
  Annotation aux = null;
  if ((MachineReadingProperties.extractEntities && !haveSerializedEntityExtractor)
      || (MachineReadingProperties.extractRelations && !haveSerializedRelationExtractor)
      || (MachineReadingProperties.extractEvents && !haveSerializedEventExtractor)
      || this.forceRetraining
      || MachineReadingProperties.crossValidate) {
    // load training sentences
    training = loadOrMakeSerializedSentences(MachineReadingProperties.trainPath, reader, new File(MachineReadingProperties.serializedTrainingSentencesPath));
    if (auxReader != null) {
      MachineReadingProperties.logger.severe("Reading auxiliary dataset from " + MachineReadingProperties.auxDataPath + "...");
      aux = loadOrMakeSerializedSentences(MachineReadingProperties.auxDataPath, auxReader, new File(MachineReadingProperties.serializedAuxTrainingSentencesPath));
      MachineReadingProperties.logger.severe("Done reading auxiliary dataset.");
    }
  }

  Annotation testing = null;
  if (!MachineReadingProperties.trainOnly && !MachineReadingProperties.crossValidate) {
    // load test sentences
    File serializedTestSentences = new File(MachineReadingProperties.serializedTestSentencesPath);
    testing = loadOrMakeSerializedSentences(MachineReadingProperties.testPath, reader, serializedTestSentences);
  }

  //
  // create the actual datasets to be used for training and annotation
  //
  makeDataSets(training, testing, aux);

  //
  for (int partition = 0; partition < datasets.length; partition++) {
    assert (datasets.length > partition);
    assert (datasets[partition] != null);
    assert (MachineReadingProperties.trainOnly || datasets[partition].second() != null);
    // train all models
    train(datasets[partition].first(), (MachineReadingProperties.crossValidate ? partition : -1));
    // annotate using all models
    if (!MachineReadingProperties.trainOnly) {
      MachineReadingProperties.logger.info("annotating partition " + partition);
      annotate(datasets[partition].second(), (MachineReadingProperties.crossValidate ? partition : -1));
    }
  }

  //
  if (!MachineReadingProperties.trainOnly) {
    // merge test sets for the gold data
    Annotation gold = new Annotation("");
    for (Pair<Annotation, Annotation> dataset : datasets) {
      AnnotationUtils.addSentences(gold, dataset.second().get(SentencesAnnotation.class));
    }

    // merge test sets with predicted annotations
    Annotation[] mergedPredictions = new Annotation[3];
    assert (predictions != null);
    for (int taskLevel = 0; taskLevel < mergedPredictions.length; taskLevel++) {
      mergedPredictions[taskLevel] = new Annotation("");
      for (int fold = 0; fold < predictions[taskLevel].length; fold++) {
        if (predictions[taskLevel][fold] == null) {
          continue;
        }
        AnnotationUtils.addSentences(mergedPredictions[taskLevel], predictions[taskLevel][fold].get(CoreAnnotations.SentencesAnnotation.class));
      }
    }

    //
    if (MachineReadingProperties.extractEntities && !entityResultsPrinterSet.isEmpty()) {
      retMsg.addAll(printTask("entity extraction", entityResultsPrinterSet, gold, mergedPredictions[ENTITY_LEVEL]));
    }
    if (MachineReadingProperties.extractRelations && !getRelationResultsPrinterSet().isEmpty()) {
      retMsg.addAll(printTask("relation extraction", getRelationResultsPrinterSet(), gold, mergedPredictions[RELATION_LEVEL]));
    }

    //
    if (MachineReadingProperties.extractEntities && MachineReadingProperties.serializedEntityExtractionResults != null) {
      IOUtils.writeObjectToFile(mergedPredictions[ENTITY_LEVEL], MachineReadingProperties.serializedEntityExtractionResults);
    }
    if (MachineReadingProperties.extractRelations && MachineReadingProperties.serializedRelationExtractionResults != null) {
      IOUtils.writeObjectToFile(mergedPredictions[RELATION_LEVEL], MachineReadingProperties.serializedRelationExtractionResults);
    }
    if (MachineReadingProperties.extractEvents && MachineReadingProperties.serializedEventExtractionResults != null) {
      IOUtils.writeObjectToFile(mergedPredictions[EVENT_LEVEL], MachineReadingProperties.serializedEventExtractionResults);
    }
  }
  return retMsg;
}
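The merge step above exists because, under cross-validation, each fold holds predictions for only its own slice of the corpus; concatenating the per-fold sentences yields one corpus that the printers can score in a single pass. The idiom in isolation, where folds is a hypothetical array of per-fold Annotations:

// Sketch only: 'folds' is hypothetical; addSentences is the same call used above.
Annotation merged = new Annotation("");
for (Annotation fold : folds) {
  if (fold == null) {
    continue;  // a task level may have no predictions for some folds
  }
  AnnotationUtils.addSentences(merged, fold.get(CoreAnnotations.SentencesAnnotation.class));
}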
use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
the class MachineReading method loadOrMakeSerializedSentences.
/**
 * Gets the serialized sentences for a data set. If the serialized sentences
 * are already on disk, it loads them from there. Otherwise, the data set is
 * read with the corpus reader and the serialized sentences are saved to disk.
 *
 * @param sentencesPath Location of the raw data set
 * @param reader The corpus reader
 * @param serializedSentences Where the serialized sentences should be stored on disk
 * @return An Annotation containing the read sentences
 */
private Annotation loadOrMakeSerializedSentences(String sentencesPath, GenericDataSetReader reader, File serializedSentences) throws IOException, ClassNotFoundException {
  Annotation corpusSentences;
  // load the serialized sentences if they already exist on disk
  if (MachineReadingProperties.serializeCorpora && serializedSentences.exists() && !forceParseSentences) {
    MachineReadingProperties.logger.info("Loading serialized sentences from " + serializedSentences.getAbsolutePath() + "...");
    corpusSentences = IOUtils.readObjectFromFile(serializedSentences);
    MachineReadingProperties.logger.info("Done. Loaded " + corpusSentences.get(CoreAnnotations.SentencesAnnotation.class).size() + " sentences.");
  } else {
    // read the corpus
    MachineReadingProperties.logger.info("Parsing corpus sentences...");
    if (MachineReadingProperties.serializeCorpora) {
      MachineReadingProperties.logger.info("These sentences will be serialized to " + serializedSentences.getAbsolutePath());
    }
    corpusSentences = reader.parse(sentencesPath);
    MachineReadingProperties.logger.info("Done. Parsed " + AnnotationUtils.sentenceCount(corpusSentences) + " sentences.");
    // save corpusSentences to disk
    if (MachineReadingProperties.serializeCorpora) {
      MachineReadingProperties.logger.info("Serializing parsed sentences to " + serializedSentences.getAbsolutePath() + "...");
      IOUtils.writeObjectToFile(corpusSentences, serializedSentences);
      MachineReadingProperties.logger.info("Done. Serialized " + AnnotationUtils.sentenceCount(corpusSentences) + " sentences.");
    }
  }
  return corpusSentences;
}
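Stripped of the logging, loadOrMakeSerializedSentences is a load-or-build cache around Java serialization. A self-contained sketch of the same idiom, assuming a hypothetical buildCorpus supplier in place of reader.parse():

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.pipeline.Annotation;
import java.io.File;
import java.util.function.Supplier;

// Sketch only: 'loadOrBuild' and 'buildCorpus' are hypothetical names.
static Annotation loadOrBuild(File cache, Supplier<Annotation> buildCorpus) throws Exception {
  if (cache.exists()) {
    return IOUtils.readObjectFromFile(cache);  // fast path: reuse the cached corpus
  }
  Annotation corpus = buildCorpus.get();       // slow path: build from the raw data
  IOUtils.writeObjectToFile(corpus, cache);    // cache it for subsequent runs
  return corpus;
}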
use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
the class AceReader method read.
/**
 * Reads in ACE*.apf.xml files and converts them to RelationSentence objects.
 * Note that you probably should call parse() instead.
 *
 * Currently, this ignores document boundaries (the list returned will include
 * sentences from all documents).
 *
 * @param path directory containing the ACE files to read (e.g.
 *          "/home/mcclosky/scr/data/ACE2005/english_test"). This can also be
 *          the path to a single file.
 * @return an Annotation containing the read sentences
 */
@Override
public Annotation read(String path) throws IOException, SAXException, ParserConfigurationException {
  List<CoreMap> allSentences = new ArrayList<>();
  File basePath = new File(path);
  assert basePath.exists();
  Annotation corpus = new Annotation("");

  if (basePath.isDirectory()) {
    for (File aceFile : IOUtils.iterFilesRecursive(basePath, ".apf.xml")) {
      if (aceFile.getName().endsWith(".UPC1.apf.xml")) {
        continue;
      }
      allSentences.addAll(readDocument(aceFile, corpus));
    }
  } else {
    // in case it's a file
    allSentences.addAll(readDocument(basePath, corpus));
  }

  AnnotationUtils.addSentences(corpus, allSentences);

  // quick stats
  if (VERBOSE) {
    printCounter(entityCounts, "entity mention");
    printCounter(relationCounts, "relation mention");
    printCounter(eventCounts, "event mention");
  }

  for (CoreMap sent : allSentences) {
    // check for entity mentions of the same type that are adjacent
    countAdjacentMentions(sent);
    // count relations between two proper nouns
    countNameRelations(sent);
    // count types of mentions
    countMentionTypes(sent);
  }

  if (VERBOSE) {
    printCounter(adjacentEntityMentions, "adjacent entity mention");
    printCounter(nameRelationCounts, "name relation mention");
    printCounter(mentionTypeCounts, "mention type counts");
  }
  return corpus;
}
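As its Javadoc notes, callers normally invoke parse() (inherited from GenericDataSetReader) rather than read() directly, since parse() also runs the reader's preprocessing. A hedged usage sketch; the AceReader constructor arguments shown are assumptions, so verify them against your CoreNLP version:

Properties props = new Properties();
props.setProperty("annotators", "tokenize,ssplit,pos,lemma,parse");  // assumed minimal pipeline
AceReader reader = new AceReader(new StanfordCoreNLP(props, false), false);  // (processor, preprocess) -- assumed signature
Annotation corpus = reader.parse("/path/to/ACE2005/english_test");  // a directory of *.apf.xml files or a single file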
use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
the class DependencyParserCoreNLPDemo method main.
public static void main(String[] args) {
  String text;
  if (args.length > 0) {
    text = IOUtils.slurpFileNoExceptions(args[0], "utf-8");
  } else {
    text = "I can almost always tell when movies use fake dinosaurs.";
  }

  Annotation ann = new Annotation(text);
  Properties props = PropertiesUtils.asProperties("annotators", "tokenize,ssplit,pos,depparse", "depparse.model", DependencyParser.DEFAULT_MODEL);
  AnnotationPipeline pipeline = new StanfordCoreNLP(props);
  pipeline.annotate(ann);

  for (CoreMap sent : ann.get(CoreAnnotations.SentencesAnnotation.class)) {
    SemanticGraph sg = sent.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
    log.info(IOUtils.eolChar + sg.toString(SemanticGraph.OutputFormat.LIST));
  }
}
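If you need the individual dependencies rather than the printed LIST form, the same SemanticGraph can be walked edge by edge inside the sentence loop above; a short sketch using the standard SemanticGraph edge API:

// Goes inside the for-loop over sentences, after 'sg' is retrieved.
for (SemanticGraphEdge edge : sg.edgeIterable()) {
  // prints e.g. "nsubj(tell, I)" for the demo sentence
  log.info(edge.getRelation() + "(" + edge.getGovernor().word() + ", " + edge.getDependent().word() + ")");
}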