Use of edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention in project CoreNLP by stanfordnlp.
The class AceReader, method readDocument:
/**
 * Reads in a single ACE*.apf.xml file and converts it to RelationSentence
 * objects. However, you probably should call parse() instead.
 *
 * @param prefix prefix of the ACE filename to read, without the ".apf.xml"
 *               extension (e.g.
 *               "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01")
 * @return list of RelationSentence objects
 */
private List<CoreMap> readDocument(String prefix, Annotation corpus) throws IOException, SAXException, ParserConfigurationException {
  logger.info("Reading document: " + prefix);
  List<CoreMap> results = new ArrayList<>();
  AceDocument aceDocument;
  if (aceVersion.equals("ACE2004")) {
    aceDocument = AceDocument.parseDocument(prefix, false, aceVersion);
  } else {
    aceDocument = AceDocument.parseDocument(prefix, false);
  }
  String docId = aceDocument.getId();

  // map entity mention ID strings to their EntityMention counterparts
  Map<String, EntityMention> entityMentionMap = Generics.newHashMap();

  /*
  for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
    List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
    StringBuffer b = new StringBuffer();
    for (AceToken t : tokens) b.append(t.getLiteral() + " ");
    logger.info("SENTENCE: " + b.toString());
  }
  */

  int tokenOffset = 0;
  for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
    List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
    List<CoreLabel> words = new ArrayList<>();
    StringBuilder textContent = new StringBuilder();
    for (int i = 0; i < tokens.size(); i++) {
      CoreLabel l = new CoreLabel();
      l.setWord(tokens.get(i).getLiteral());
      l.set(CoreAnnotations.ValueAnnotation.class, l.word());
      l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokens.get(i).getByteStart());
      l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokens.get(i).getByteEnd());
      words.add(l);
      if (i > 0) textContent.append(" ");
      textContent.append(tokens.get(i).getLiteral());
    }

    // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
    if (words.size() == 1) {
      String word = words.get(0).word();
      if (word.startsWith("<") && word.endsWith(">")) {
        tokenOffset += tokens.size();
        continue;
      }
    }

    CoreMap sentence = new Annotation(textContent.toString());
    sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
    sentence.set(CoreAnnotations.TokensAnnotation.class, words);
    logger.info("Reading sentence: \"" + textContent + "\"");

    List<AceEntityMention> entityMentions = aceDocument.getEntityMentions(sentenceIndex);
    List<AceRelationMention> relationMentions = aceDocument.getRelationMentions(sentenceIndex);
    List<AceEventMention> eventMentions = aceDocument.getEventMentions(sentenceIndex);

    // convert entity mentions
    for (AceEntityMention aceEntityMention : entityMentions) {
      String corefID = "";
      for (String entityID : aceDocument.getKeySetEntities()) {
        AceEntity e = aceDocument.getEntity(entityID);
        if (e.getMentions().contains(aceEntityMention)) {
          corefID = entityID;
          break;
        }
      }
      EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
      // EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset);
      entityCounts.incrementCount(convertedMention.getType());
      logger.info("CONVERTED MENTION HEAD SPAN: " + convertedMention.getHead());
      logger.info("CONVERTED ENTITY MENTION: " + convertedMention);
      AnnotationUtils.addEntityMention(sentence, convertedMention);
      entityMentionMap.put(aceEntityMention.getId(), convertedMention);
      // TODO: make Entity objects as needed
    }

    // convert relation mentions
    for (AceRelationMention aceRelationMention : relationMentions) {
      RelationMention convertedMention = convertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap);
      if (convertedMention != null) {
        relationCounts.incrementCount(convertedMention.getType());
        logger.info("CONVERTED RELATION MENTION: " + convertedMention);
        AnnotationUtils.addRelationMention(sentence, convertedMention);
      }
      // TODO: make Relation objects
    }

    // convert EventMentions
    for (AceEventMention aceEventMention : eventMentions) {
      EventMention convertedMention = convertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset);
      if (convertedMention != null) {
        eventCounts.incrementCount(convertedMention.getType());
        logger.info("CONVERTED EVENT MENTION: " + convertedMention);
        AnnotationUtils.addEventMention(sentence, convertedMention);
      }
      // TODO: make Event objects
    }

    results.add(sentence);
    tokenOffset += tokens.size();
  }

  return results;
}
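Since readDocument is private, external callers go through the public parse() entry point that AceReader inherits from GenericDataSetReader. The sketch below shows one plausible way to drive it; the constructor arguments and the data path are assumptions for illustration, not taken from the snippet above:

import edu.stanford.nlp.ie.machinereading.domains.ace.AceReader;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;

public class AceReaderDemo {
  public static void main(String[] args) throws Exception {
    // Assumption: this constructor (no preprocessing pipeline, preprocess disabled) is available.
    AceReader reader = new AceReader(null, false);
    // parse() reads the ACE annotations under the given path, invoking
    // readDocument once per *.apf.xml file prefix it finds.
    Annotation corpus = reader.parse("/path/to/ACE2005/english_test/bc");  // hypothetical path
    int n = corpus.get(CoreAnnotations.SentencesAnnotation.class).size();
    System.out.println("Read " + n + " annotated sentences");
  }
}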
Use of edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention in project CoreNLP by stanfordnlp.
The class AceReader, method convertAceEventMention:
private EventMention convertAceEventMention(AceEventMention aceEventMention, String docId, CoreMap sentence, Map<String, EntityMention> entityMap, int tokenOffset) {
  Set<String> roleSet = aceEventMention.getRoles();
  List<String> roles = new ArrayList<>();
  for (String role : roleSet) roles.add(role);
  List<ExtractionObject> convertedArgs = new ArrayList<>();

  int left = Integer.MAX_VALUE;
  int right = Integer.MIN_VALUE;
  for (String role : roles) {
    AceEntityMention arg = aceEventMention.getArg(role);
    ExtractionObject o = entityMap.get(arg.getId());
    if (o == null) {
      logger.severe("READER ERROR: Failed to find event argument with id " + arg.getId());
      logger.severe("This happens because a few event mentions illegally span multiple sentences. Will ignore this mention.");
      return null;
    }
    convertedArgs.add(o);
    if (o.getExtentTokenStart() < left)
      left = o.getExtentTokenStart();
    if (o.getExtentTokenEnd() > right)
      right = o.getExtentTokenEnd();
  }

  AceCharSeq anchor = aceEventMention.getAnchor();
  ExtractionObject anchorObject = new ExtractionObject(aceEventMention.getId() + "-anchor", sentence, new Span(anchor.getTokenStart() - tokenOffset, anchor.getTokenEnd() + 1 - tokenOffset), "ANCHOR", null);

  EventMention em = new EventMention(aceEventMention.getId(), sentence, new Span(left, right), aceEventMention.getParent().getType(), aceEventMention.getParent().getSubtype(), anchorObject, convertedArgs, roles);
  return em;
}
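The Span passed to the EventMention above is the tightest envelope over the arguments' extents: the minimum start and the maximum end across all arguments. A small self-contained illustration of that computation, using hypothetical token spans:

public class SpanEnvelopeDemo {
  public static void main(String[] args) {
    // Hypothetical sentence-relative [start, end) token spans of three event arguments.
    int[][] argSpans = { {4, 6}, {9, 12}, {2, 3} };
    int left = Integer.MAX_VALUE;
    int right = Integer.MIN_VALUE;
    for (int[] s : argSpans) {
      if (s[0] < left) left = s[0];    // smallest start wins
      if (s[1] > right) right = s[1];  // largest end wins
    }
    // The event extent covers tokens [2, 12), so every argument falls inside it.
    System.out.println("event extent = [" + left + ", " + right + ")");
  }
}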
Use of edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention in project CoreNLP by stanfordnlp.
The class AceReader, method convertAceEntityMention:
/**
 * Converts an {@link AceEntityMention} to an {@link EntityMention}.
 *
 * @param entityMention {@link AceEntityMention} to convert
 * @param docId ID of the document containing this entity mention
 * @param sentence the sentence (as a {@link CoreMap}) containing this entity mention
 * @param tokenOffset offset used to convert extent positions to sentence-relative ones
 *                    (the ace.reader stores absolute token offsets from the beginning of
 *                    the document, but we need token offsets from the beginning of the
 *                    sentence => adjust by tokenOffset)
 * @return the entity as an {@link EntityMention}
 */
private EntityMention convertAceEntityMention(AceEntityMention entityMention, String docId, CoreMap sentence, int tokenOffset) {
  //log.info("TYPE is " + entityMention.getParent().getType());
  //log.info("SUBTYPE is " + entityMention.getParent().getSubtype());
  //log.info("LDCTYPE is " + entityMention.getLdctype());

  AceCharSeq ext = entityMention.getExtent();
  AceCharSeq head = entityMention.getHead();

  int extStart = ext.getTokenStart() - tokenOffset;
  int extEnd = ext.getTokenEnd() - tokenOffset + 1;
  if (extStart < 0) {
    logger.severe("READER ERROR: Invalid extent start " + extStart + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence);
    logger.severe("This may happen due to incorrect EOS detection. Adjusting entity extent.");
    extStart = 0;
  }
  if (extEnd > sentence.get(CoreAnnotations.TokensAnnotation.class).size()) {
    logger.severe("READER ERROR: Invalid extent end " + extEnd + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence);
    logger.severe("This may happen due to incorrect EOS detection. Adjusting entity extent.");
    extEnd = sentence.get(CoreAnnotations.TokensAnnotation.class).size();
  }

  int headStart = head.getTokenStart() - tokenOffset;
  int headEnd = head.getTokenEnd() - tokenOffset + 1;
  if (headStart < 0) {
    logger.severe("READER ERROR: Invalid head start " + headStart + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence);
    logger.severe("This may happen due to incorrect EOS detection. Adjusting entity head span.");
    headStart = 0;
  }
  if (headEnd > sentence.get(CoreAnnotations.TokensAnnotation.class).size()) {
    logger.severe("READER ERROR: Invalid head end " + headEnd + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence);
    logger.severe("This may happen due to incorrect EOS detection. Adjusting entity head span.");
    headEnd = sentence.get(CoreAnnotations.TokensAnnotation.class).size();
  }

  // must adjust due to possible incorrect EOS detection
  if (headStart < extStart) {
    headStart = extStart;
  }
  if (headEnd > extEnd) {
    headEnd = extEnd;
  }
  assert (headStart < headEnd);

  // note: the ace.reader stores absolute token offset from the beginning of the document, but
  // we need token offsets from the beginning of the sentence => adjust by tokenOffset
  // note: in ace.reader the end token position is inclusive, but
  // in our setup the end token position is exclusive => add 1 to end
  EntityMention converted = new EntityMention(entityMention.getId(), sentence, new Span(extStart, extEnd), new Span(headStart, headEnd), entityMention.getParent().getType(), entityMention.getParent().getSubtype(), entityMention.getLdctype());
  return converted;
}
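The two notes at the end of the method summarize the span arithmetic: subtract tokenOffset to go from document-absolute to sentence-relative positions, and add 1 to the end because ace.reader's end positions are inclusive while Span's are exclusive. A tiny worked example with hypothetical numbers:

public class OffsetDemo {
  public static void main(String[] args) {
    int tokenOffset = 120;      // hypothetical: the sentence starts at document token 120
    int aceStart = 123;         // ace.reader extent start, document-absolute, inclusive
    int aceEndInclusive = 125;  // ace.reader extent end, document-absolute, inclusive

    int extStart = aceStart - tokenOffset;           // 3: sentence-relative start
    int extEnd = aceEndInclusive - tokenOffset + 1;  // 6: sentence-relative, exclusive end

    // The resulting Span covers sentence tokens 3, 4, and 5.
    System.out.println("extent = [" + extStart + ", " + extEnd + ")");
  }
}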