Use of edu.stanford.nlp.ie.machinereading.structure.EntityMention in project CoreNLP by stanfordnlp.
The class GenericDataSetReader, method preProcessSentences.
/**
* Takes a dataset Annotation, generates parse trees for its sentences, and identifies syntactic heads (and head spans, if necessary).
*/
public void preProcessSentences(Annotation dataset) {
logger.severe("GenericDataSetReader: Started pre-processing the corpus...");
// run the processor, i.e., NER, parse etc.
if (processor != null) {
// we might already have syntactic annotation from offline files
List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);
if (sentences.size() > 0 && !sentences.get(0).containsKey(TreeCoreAnnotations.TreeAnnotation.class)) {
logger.info("Annotating dataset with " + processor);
processor.annotate(dataset);
} else {
logger.info("Found existing syntactic annotations. Will not use the NLP processor.");
}
}
/*
List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);
for(int i = 0; i < sentences.size(); i ++){
CoreMap sent = sentences.get(i);
List<CoreLabel> tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
logger.info("Tokens for sentence #" + i + ": " + tokens);
logger.info("Parse tree for sentence #" + i + ": " + sent.get(TreeCoreAnnotations.TreeAnnotation.class).pennString());
}
*/
List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);
logger.fine("Extracted " + sentences.size() + " sentences.");
for (CoreMap sentence : sentences) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
logger.fine("Processing sentence " + tokens);
Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
if (tree == null)
throw new RuntimeException("ERROR: MR requires full syntactic analysis!");
// convert tree labels to CoreLabel if necessary
// we need this because we store additional info in the CoreLabel, such as the spans of each tree
convertToCoreLabels(tree);
// store the tree spans, if not present already
CoreLabel l = (CoreLabel) tree.label();
if (forceGenerationOfIndexSpans || (!l.containsKey(CoreAnnotations.BeginIndexAnnotation.class) && !l.containsKey(CoreAnnotations.EndIndexAnnotation.class))) {
tree.indexSpans(0);
logger.fine("Index spans were generated.");
} else {
logger.fine("Index spans were NOT generated.");
}
logger.fine("Parse tree using CoreLabel:\n" + tree.pennString());
// match each entity mention against the syntactic tree and assign its head
if (sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class) != null) {
for (EntityMention ent : sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class)) {
logger.fine("Finding head for entity: " + ent);
int headPos = assignSyntacticHead(ent, tree, tokens, calculateHeadSpan);
logger.fine("Syntactic head of mention \"" + ent + "\" is: " + tokens.get(headPos).word());
assert (ent.getExtent() != null);
assert (ent.getHead() != null);
assert (ent.getSyntacticHeadTokenPosition() >= 0);
}
}
}
logger.severe("GenericDataSetReader: Pre-processing complete.");
}
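For readers who want to see the span-indexing step in isolation, here is a minimal, self-contained sketch. It assumes a standard StanfordCoreNLP pipeline whose parse annotator produces trees with CoreLabel labels; the pipeline properties and example text are illustrative, not taken from the reader above.
import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;

public class IndexSpansSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, parse");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("John visited Paris.");
    pipeline.annotate(doc);
    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
      // assign token-index spans to every node, starting at token 0,
      // just as preProcessSentences does when spans are missing
      tree.indexSpans(0);
      // the cast assumes CoreLabel tree labels, which is what the parse annotator produces
      CoreLabel root = (CoreLabel) tree.label();
      System.out.println("Root span: [" + root.get(CoreAnnotations.BeginIndexAnnotation.class)
          + ", " + root.get(CoreAnnotations.EndIndexAnnotation.class) + ")");
    }
  }
}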
Use of edu.stanford.nlp.ie.machinereading.structure.EntityMention in project CoreNLP by stanfordnlp.
The class GenericDataSetReader, method modifyUsingCoreNLPNER.
private void modifyUsingCoreNLPNER(Annotation doc) {
Properties ann = new Properties();
ann.setProperty("annotators", "pos, lemma, ner");
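// the second constructor argument (enforceRequirements == false) skips annotator
// requirement checks, since the document is expected to be tokenized and sentence-split already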
StanfordCoreNLP pipeline = new StanfordCoreNLP(ann, false);
pipeline.annotate(doc);
for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
List<EntityMention> entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
if (entities != null) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
for (EntityMention en : entities) {
//System.out.println("old ner tag for " + en.getExtentString() + " was " + en.getType());
Span s = en.getExtent();
Counter<String> allNertagforSpan = new ClassicCounter<>();
for (int i = s.start(); i < s.end(); i++) {
allNertagforSpan.incrementCount(tokens.get(i).ner());
}
String entityNertag = Counters.argmax(allNertagforSpan);
en.setType(entityNertag);
//System.out.println("new ner tag is " + entityNertag);
}
}
}
}
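The retagging above is just a majority vote over token-level NER tags. A tiny standalone sketch of that voting step, with made-up tag values:
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;

public class MajorityNerVote {
  public static void main(String[] args) {
    // token-level NER tags for a hypothetical three-token mention
    String[] tokenTags = {"PERSON", "PERSON", "O"};
    Counter<String> votes = new ClassicCounter<>();
    for (String tag : tokenTags) {
      votes.incrementCount(tag);
    }
    // the mention-level type is the most frequent token-level tag
    System.out.println(Counters.argmax(votes)); // prints PERSON
  }
}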
Use of edu.stanford.nlp.ie.machinereading.structure.EntityMention in project CoreNLP by stanfordnlp.
The class RothCONLL04Reader, method readSentence.
private Annotation readSentence(String docId, Iterator<String> lineIterator) {
Annotation sentence = new Annotation("");
sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, new ArrayList<>());
// we'll need to set things like the tokens and textContent after we've
// fully read the sentence
// contains the full text that we've read so far
StringBuilder textContent = new StringBuilder();
// how many tokens we've seen so far
int tokenCount = 0;
List<CoreLabel> tokens = new ArrayList<>();
// when we've seen two blank lines in a row, this sentence is over (one
// blank line separates the sentence from the relations)
int numBlankLinesSeen = 0;
String sentenceID = null;
// keeps tracks of entities we've seen so far for use by relations
Map<String, EntityMention> indexToEntityMention = new HashMap<>();
while (lineIterator.hasNext() && numBlankLinesSeen < 2) {
String currentLine = lineIterator.next();
currentLine = currentLine.replace("COMMA", ",");
List<String> pieces = StringUtils.split(currentLine);
String identifier;
int size = pieces.size();
switch(size) {
case 1: // blank line between sentences or relations
numBlankLinesSeen++;
break;
case 3: // relation
String type = pieces.get(2);
List<ExtractionObject> args = new ArrayList<>();
EntityMention entity1 = indexToEntityMention.get(pieces.get(0));
EntityMention entity2 = indexToEntityMention.get(pieces.get(1));
args.add(entity1);
args.add(entity2);
Span span = new Span(entity1.getExtentTokenStart(), entity2.getExtentTokenEnd());
// identifier = "relation" + sentenceID + "-" + sentence.getAllRelations().size();
identifier = RelationMention.makeUniqueId();
RelationMention relationMention = new RelationMention(identifier, sentence, span, type, null, args);
AnnotationUtils.addRelationMention(sentence, relationMention);
break;
case 9: // token
/*
* Roth token lines look like this:
*
* 19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
*/
// Entities may be multiple words joined by '/'; we split these up
List<String> words = StringUtils.split(pieces.get(5), "/");
//List<String> postags = StringUtils.split(pieces.get(4),"/");
String text = StringUtils.join(words, " ");
identifier = "entity" + pieces.get(0) + '-' + pieces.get(2);
// entity type of the word/expression
String nerTag = getNormalizedNERTag(pieces.get(1));
if (sentenceID == null)
sentenceID = pieces.get(0);
if (!nerTag.equals("O")) {
Span extentSpan = new Span(tokenCount, tokenCount + words.size());
// Temporarily sets the head span to equal the extent span.
// This is so the entity has a head (in particular, getValue() works) even if preprocessSentences isn't called.
// The head span is later modified if preprocessSentences is called.
EntityMention entity = new EntityMention(identifier, sentence, extentSpan, extentSpan, nerTag, null, null);
AnnotationUtils.addEntityMention(sentence, entity);
// we can get by using these indices as strings since we only use them
// as a hash key
String index = pieces.get(2);
indexToEntityMention.put(index, entity);
}
// int i =0;
for (String word : words) {
CoreLabel label = new CoreLabel();
label.setWord(word);
//label.setTag(postags.get(i));
label.set(CoreAnnotations.TextAnnotation.class, word);
label.set(CoreAnnotations.ValueAnnotation.class, word);
// we don't set TokenBeginAnnotation or TokenEndAnnotation since we're
// not keeping track of character offsets
tokens.add(label);
// i++;
}
textContent.append(text);
textContent.append(' ');
tokenCount += words.size();
break;
}
}
sentence.set(CoreAnnotations.TextAnnotation.class, textContent.toString());
sentence.set(CoreAnnotations.ValueAnnotation.class, textContent.toString());
sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
sentence.set(CoreAnnotations.SentenceIDAnnotation.class, sentenceID);
return sentence;
}
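To make the nine-column token format concrete, the following standalone sketch parses the example line quoted in the comment above using the same StringUtils helpers:
import java.util.List;
import edu.stanford.nlp.util.StringUtils;

public class RothLineSketch {
  public static void main(String[] args) {
    // the example token line quoted in readSentence's comment
    String line = "19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O";
    List<String> pieces = StringUtils.split(line);
    System.out.println("sentence id: " + pieces.get(0)); // 19
    System.out.println("entity type: " + pieces.get(1)); // Peop
    System.out.println("token index: " + pieces.get(2)); // 9
    // multi-word entities are joined by '/', so split them back apart
    List<String> words = StringUtils.split(pieces.get(5), "/");
    System.out.println("words: " + StringUtils.join(words, " ")); // Jamal Ghosheh
  }
}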
Use of edu.stanford.nlp.ie.machinereading.structure.EntityMention in project CoreNLP by stanfordnlp.
The class AceReader, method readDocument.
/**
* Reads in a single ACE*.apf.xml file and converts it to RelationSentence
* objects. However, you probably should call parse() instead.
*
* @param prefix prefix of the ACE filename to read, without the ".apf.xml"
* extension (e.g.
* "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01")
* @return list of RelationSentence objects
*/
private List<CoreMap> readDocument(String prefix, Annotation corpus) throws IOException, SAXException, ParserConfigurationException {
logger.info("Reading document: " + prefix);
List<CoreMap> results = new ArrayList<>();
AceDocument aceDocument;
if (aceVersion.equals("ACE2004")) {
aceDocument = AceDocument.parseDocument(prefix, false, aceVersion);
} else {
aceDocument = AceDocument.parseDocument(prefix, false);
}
String docId = aceDocument.getId();
// map entity mention ID strings to their EntityMention counterparts
Map<String, EntityMention> entityMentionMap = Generics.newHashMap();
/*
for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
StringBuffer b = new StringBuffer();
for(AceToken t: tokens) b.append(t.getLiteral() + " " );
logger.info("SENTENCE: " + b.toString());
}
*/
int tokenOffset = 0;
for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
List<CoreLabel> words = new ArrayList<>();
StringBuilder textContent = new StringBuilder();
for (int i = 0; i < tokens.size(); i++) {
CoreLabel l = new CoreLabel();
l.setWord(tokens.get(i).getLiteral());
l.set(CoreAnnotations.ValueAnnotation.class, l.word());
l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokens.get(i).getByteStart());
l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokens.get(i).getByteEnd());
words.add(l);
if (i > 0)
textContent.append(" ");
textContent.append(tokens.get(i).getLiteral());
}
// skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
if (words.size() == 1) {
String word = words.get(0).word();
if (word.startsWith("<") && word.endsWith(">")) {
tokenOffset += tokens.size();
continue;
}
}
CoreMap sentence = new Annotation(textContent.toString());
sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
sentence.set(CoreAnnotations.TokensAnnotation.class, words);
logger.info("Reading sentence: \"" + textContent + "\"");
List<AceEntityMention> entityMentions = aceDocument.getEntityMentions(sentenceIndex);
List<AceRelationMention> relationMentions = aceDocument.getRelationMentions(sentenceIndex);
List<AceEventMention> eventMentions = aceDocument.getEventMentions(sentenceIndex);
// convert entity mentions
for (AceEntityMention aceEntityMention : entityMentions) {
String corefID = "";
for (String entityID : aceDocument.getKeySetEntities()) {
AceEntity e = aceDocument.getEntity(entityID);
if (e.getMentions().contains(aceEntityMention)) {
corefID = entityID;
break;
}
}
EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
// EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset);
entityCounts.incrementCount(convertedMention.getType());
logger.info("CONVERTED MENTION HEAD SPAN: " + convertedMention.getHead());
logger.info("CONVERTED ENTITY MENTION: " + convertedMention);
AnnotationUtils.addEntityMention(sentence, convertedMention);
entityMentionMap.put(aceEntityMention.getId(), convertedMention);
// TODO: make Entity objects as needed
}
// convert relation mentions
for (AceRelationMention aceRelationMention : relationMentions) {
RelationMention convertedMention = convertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap);
if (convertedMention != null) {
relationCounts.incrementCount(convertedMention.getType());
logger.info("CONVERTED RELATION MENTION: " + convertedMention);
AnnotationUtils.addRelationMention(sentence, convertedMention);
}
// TODO: make Relation objects
}
// convert EventMentions
for (AceEventMention aceEventMention : eventMentions) {
EventMention convertedMention = convertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset);
if (convertedMention != null) {
eventCounts.incrementCount(convertedMention.getType());
logger.info("CONVERTED EVENT MENTION: " + convertedMention);
AnnotationUtils.addEventMention(sentence, convertedMention);
}
// TODO: make Event objects
}
results.add(sentence);
tokenOffset += tokens.size();
}
return results;
}
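As the javadoc notes, callers normally go through parse() rather than readDocument() directly. A hedged sketch of that entry point, assuming AceReader's no-argument constructor is usable as-is; the corpus path is a placeholder:
import edu.stanford.nlp.ie.machinereading.domains.ace.AceReader;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;

public class AceReaderSketch {
  public static void main(String[] args) throws Exception {
    // assumption: the default constructor configures a usable reader;
    // the path below is a placeholder for an ACE corpus directory
    AceReader reader = new AceReader();
    Annotation corpus = reader.parse("/path/to/ACE2005/english_test");
    System.out.println("Sentences read: "
        + corpus.get(CoreAnnotations.SentencesAnnotation.class).size());
  }
}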
Use of edu.stanford.nlp.ie.machinereading.structure.EntityMention in project CoreNLP by stanfordnlp.
The class ProtobufAnnotationSerializer, method toProtoBuilder.
/**
* <p>
* The method to extend by subclasses of the Protobuf Annotator if custom additions are added to Sentences.
* In contrast to {@link ProtobufAnnotationSerializer#toProto(edu.stanford.nlp.ling.CoreLabel)}, this function
* returns a builder that can be extended.
* </p>
*
* @param sentence The sentence to save to a protocol buffer
* @param keysToSerialize A set tracking which keys have been saved. It's important to remove any keys added to the proto
* from this set, as the code tracks annotations to ensure lossless serialization.
* @return A partially filled protocol buffer builder for this sentence.
*/
@SuppressWarnings("deprecation")
protected CoreNLPProtos.Sentence.Builder toProtoBuilder(CoreMap sentence, Set<Class<?>> keysToSerialize) {
// Error checks
if (sentence instanceof CoreLabel) {
throw new IllegalArgumentException("CoreMap is actually a CoreLabel");
}
CoreNLPProtos.Sentence.Builder builder = CoreNLPProtos.Sentence.newBuilder();
// Remove items serialized elsewhere from the required list
keysToSerialize.remove(TextAnnotation.class);
keysToSerialize.remove(NumerizedTokensAnnotation.class);
// Required fields
builder.setTokenOffsetBegin(getAndRegister(sentence, keysToSerialize, TokenBeginAnnotation.class));
builder.setTokenOffsetEnd(getAndRegister(sentence, keysToSerialize, TokenEndAnnotation.class));
// Get key set of CoreMap
Set<Class<?>> keySet;
if (sentence instanceof ArrayCoreMap) {
keySet = ((ArrayCoreMap) sentence).keySetNotNull();
} else {
keySet = new IdentityHashSet<>(sentence.keySet());
}
// Tokens
if (sentence.containsKey(TokensAnnotation.class)) {
for (CoreLabel tok : sentence.get(TokensAnnotation.class)) {
builder.addToken(toProto(tok));
}
keysToSerialize.remove(TokensAnnotation.class);
}
// Characters
if (sentence.containsKey(SegmenterCoreAnnotations.CharactersAnnotation.class)) {
for (CoreLabel c : sentence.get(SegmenterCoreAnnotations.CharactersAnnotation.class)) {
builder.addCharacter(toProto(c));
}
keysToSerialize.remove(SegmenterCoreAnnotations.CharactersAnnotation.class);
}
// Optional fields
if (keySet.contains(SentenceIndexAnnotation.class)) {
builder.setSentenceIndex(getAndRegister(sentence, keysToSerialize, SentenceIndexAnnotation.class));
}
if (keySet.contains(CharacterOffsetBeginAnnotation.class)) {
builder.setCharacterOffsetBegin(getAndRegister(sentence, keysToSerialize, CharacterOffsetBeginAnnotation.class));
}
if (keySet.contains(CharacterOffsetEndAnnotation.class)) {
builder.setCharacterOffsetEnd(getAndRegister(sentence, keysToSerialize, CharacterOffsetEndAnnotation.class));
}
if (keySet.contains(TreeAnnotation.class)) {
builder.setParseTree(toProto(getAndRegister(sentence, keysToSerialize, TreeAnnotation.class)));
}
if (keySet.contains(BinarizedTreeAnnotation.class)) {
builder.setBinarizedParseTree(toProto(getAndRegister(sentence, keysToSerialize, BinarizedTreeAnnotation.class)));
}
if (keySet.contains(KBestTreesAnnotation.class)) {
for (Tree tree : sentence.get(KBestTreesAnnotation.class)) {
builder.addKBestParseTrees(toProto(tree));
keysToSerialize.remove(KBestTreesAnnotation.class);
}
}
if (keySet.contains(SentimentCoreAnnotations.SentimentAnnotatedTree.class)) {
builder.setAnnotatedParseTree(toProto(getAndRegister(sentence, keysToSerialize, SentimentCoreAnnotations.SentimentAnnotatedTree.class)));
}
if (keySet.contains(SentimentCoreAnnotations.SentimentClass.class)) {
builder.setSentiment(getAndRegister(sentence, keysToSerialize, SentimentCoreAnnotations.SentimentClass.class));
}
if (keySet.contains(BasicDependenciesAnnotation.class)) {
builder.setBasicDependencies(toProto(getAndRegister(sentence, keysToSerialize, BasicDependenciesAnnotation.class)));
}
if (keySet.contains(CollapsedDependenciesAnnotation.class)) {
builder.setCollapsedDependencies(toProto(getAndRegister(sentence, keysToSerialize, CollapsedDependenciesAnnotation.class)));
}
if (keySet.contains(CollapsedCCProcessedDependenciesAnnotation.class)) {
builder.setCollapsedCCProcessedDependencies(toProto(getAndRegister(sentence, keysToSerialize, CollapsedCCProcessedDependenciesAnnotation.class)));
}
if (keySet.contains(AlternativeDependenciesAnnotation.class)) {
builder.setAlternativeDependencies(toProto(getAndRegister(sentence, keysToSerialize, AlternativeDependenciesAnnotation.class)));
}
if (keySet.contains(EnhancedDependenciesAnnotation.class)) {
builder.setEnhancedDependencies(toProto(getAndRegister(sentence, keysToSerialize, EnhancedDependenciesAnnotation.class)));
}
if (keySet.contains(EnhancedPlusPlusDependenciesAnnotation.class)) {
builder.setEnhancedPlusPlusDependencies(toProto(getAndRegister(sentence, keysToSerialize, EnhancedPlusPlusDependenciesAnnotation.class)));
}
if (keySet.contains(TokensAnnotation.class) && getAndRegister(sentence, keysToSerialize, TokensAnnotation.class).size() > 0 && getAndRegister(sentence, keysToSerialize, TokensAnnotation.class).get(0).containsKey(ParagraphAnnotation.class)) {
builder.setParagraph(getAndRegister(sentence, keysToSerialize, TokensAnnotation.class).get(0).get(ParagraphAnnotation.class));
}
if (keySet.contains(NumerizedTokensAnnotation.class)) {
builder.setHasNumerizedTokensAnnotation(true);
} else {
builder.setHasNumerizedTokensAnnotation(false);
}
if (keySet.contains(NaturalLogicAnnotations.EntailedSentencesAnnotation.class)) {
for (SentenceFragment entailedSentence : getAndRegister(sentence, keysToSerialize, NaturalLogicAnnotations.EntailedSentencesAnnotation.class)) {
builder.addEntailedSentence(toProto(entailedSentence));
}
}
if (keySet.contains(NaturalLogicAnnotations.EntailedClausesAnnotation.class)) {
for (SentenceFragment entailedClause : getAndRegister(sentence, keysToSerialize, NaturalLogicAnnotations.EntailedClausesAnnotation.class)) {
builder.addEntailedClause(toProto(entailedClause));
}
}
if (keySet.contains(NaturalLogicAnnotations.RelationTriplesAnnotation.class)) {
for (RelationTriple triple : getAndRegister(sentence, keysToSerialize, NaturalLogicAnnotations.RelationTriplesAnnotation.class)) {
builder.addOpenieTriple(toProto(triple));
}
}
if (keySet.contains(KBPTriplesAnnotation.class)) {
for (RelationTriple triple : getAndRegister(sentence, keysToSerialize, KBPTriplesAnnotation.class)) {
builder.addKbpTriple(toProto(triple));
}
}
// Non-default annotators
if (keySet.contains(EntityMentionsAnnotation.class)) {
builder.setHasRelationAnnotations(true);
for (EntityMention entity : getAndRegister(sentence, keysToSerialize, EntityMentionsAnnotation.class)) {
builder.addEntity(toProto(entity));
}
} else {
builder.setHasRelationAnnotations(false);
}
if (keySet.contains(RelationMentionsAnnotation.class)) {
if (!builder.getHasRelationAnnotations()) {
throw new IllegalStateException("Found relation mentions without entity mentions");
}
for (RelationMention relation : getAndRegister(sentence, keysToSerialize, RelationMentionsAnnotation.class)) {
builder.addRelation(toProto(relation));
}
}
// add each of the mentions in the List<Mentions> for this sentence
if (keySet.contains(CorefMentionsAnnotation.class)) {
builder.setHasCorefMentionsAnnotation(true);
for (Mention m : sentence.get(CorefMentionsAnnotation.class)) {
builder.addMentionsForCoref(toProto(m));
}
keysToSerialize.remove(CorefMentionsAnnotation.class);
}
// Entity mentions
if (keySet.contains(MentionsAnnotation.class)) {
for (CoreMap mention : sentence.get(MentionsAnnotation.class)) {
builder.addMentions(toProtoMention(mention));
}
keysToSerialize.remove(MentionsAnnotation.class);
}
// add a sentence id if it exists
if (keySet.contains(SentenceIDAnnotation.class))
builder.setSentenceID(getAndRegister(sentence, keysToSerialize, SentenceIDAnnotation.class));
// Return
return builder;
}
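For completeness, a minimal sketch of exercising this builder through the serializer's public write() API, which internally converts each sentence via toProtoBuilder; the pipeline configuration and output file name are illustrative:
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class ProtoSerializeSketch {
  public static void main(String[] args) throws Exception {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("Serialize me.");
    pipeline.annotate(doc);
    // write() converts the document to a protocol buffer, building each
    // sentence through toProtoBuilder, and streams it out
    ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer();
    try (OutputStream os = new FileOutputStream("doc.proto")) {
      serializer.write(doc, os);
    }
  }
}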