Use of edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceToken in project CoreNLP by stanfordnlp.
From the class AceReader, method readDocument.
/**
 * Parses the ACE document identified by {@code prefix} and turns each of its
 * sentences into a {@link CoreMap} holding the sentence text, its tokens, and
 * the converted entity, relation, and event mentions. Sentences that consist
 * of a single token shaped like an SGML tag ({@code <...>}) are dropped, but
 * their tokens still count towards the running token offset.
 * <p>
 * As a side effect, {@code entityCounts}, {@code relationCounts} and
 * {@code eventCounts} are incremented by the type of every converted mention.
 *
 * @param prefix prefix of the ACE filename to read (e.g.
 *        "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01"),
 *        without the ".apf.xml" extension
 * @param corpus not referenced inside this method
 * @return one CoreMap per retained sentence, in document order
 * @throws IOException declared by this method; presumably raised while the
 *         ACE files are parsed (confirm against AceDocument.parseDocument)
 * @throws SAXException declared by this method; presumably raised while the
 *         ACE files are parsed (confirm against AceDocument.parseDocument)
 * @throws ParserConfigurationException declared by this method; presumably
 *         raised while the ACE files are parsed (confirm against
 *         AceDocument.parseDocument)
 */
private List<CoreMap> readDocument(String prefix, Annotation corpus) throws IOException, SAXException, ParserConfigurationException {
  logger.info("Reading document: " + prefix);

  // ACE2004 needs the version-aware parser overload; everything else uses the default one.
  AceDocument aceDocument = aceVersion.equals("ACE2004")
      ? AceDocument.parseDocument(prefix, false, aceVersion)
      : AceDocument.parseDocument(prefix, false);
  String docId = aceDocument.getId();

  List<CoreMap> sentences = new ArrayList<>();
  // ACE entity mention id -> converted EntityMention; shared across all sentences
  // so that relation and event conversion can look up their arguments.
  Map<String, EntityMention> mentionsById = Generics.newHashMap();

  // Number of tokens seen in all preceding sentences (including skipped ones).
  int tokenOffset = 0;
  for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
    List<AceToken> aceTokens = aceDocument.getSentence(sentenceIndex);

    // Build one CoreLabel per ACE token and the space-joined sentence text.
    List<CoreLabel> labels = new ArrayList<>();
    StringBuilder joined = new StringBuilder();
    String separator = "";
    for (AceToken aceToken : aceTokens) {
      String literal = aceToken.getLiteral();
      CoreLabel label = new CoreLabel();
      label.setWord(literal);
      label.set(CoreAnnotations.ValueAnnotation.class, label.word());
      label.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, aceToken.getByteStart());
      label.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, aceToken.getByteEnd());
      labels.add(label);
      joined.append(separator).append(literal);
      separator = " ";
    }

    // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
    boolean sgmlTagOnly = false;
    if (labels.size() == 1) {
      String only = labels.get(0).word();
      sgmlTagOnly = only.startsWith("<") && only.endsWith(">");
    }
    if (sgmlTagOnly) {
      tokenOffset += aceTokens.size();
      continue;
    }

    String sentenceText = joined.toString();
    CoreMap sentence = new Annotation(sentenceText);
    sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
    sentence.set(CoreAnnotations.TokensAnnotation.class, labels);
    logger.info("Reading sentence: \"" + sentenceText + "\"");

    List<AceEntityMention> aceEntities = aceDocument.getEntityMentions(sentenceIndex);
    List<AceRelationMention> aceRelations = aceDocument.getRelationMentions(sentenceIndex);
    List<AceEventMention> aceEvents = aceDocument.getEventMentions(sentenceIndex);

    // convert entity mentions
    for (AceEntityMention aceEntity : aceEntities) {
      // The coref id is the id of the first entity whose mention list contains
      // this mention; it stays empty when no entity claims the mention.
      String corefId = "";
      for (String candidateId : aceDocument.getKeySetEntities()) {
        AceEntity candidate = aceDocument.getEntity(candidateId);
        if (candidate.getMentions().contains(aceEntity)) {
          corefId = candidateId;
          break;
        }
      }

      EntityMention entity = convertAceEntityMention(aceEntity, docId, sentence, tokenOffset, corefId);
      entityCounts.incrementCount(entity.getType());
      logger.info("CONVERTED MENTION HEAD SPAN: " + entity.getHead());
      logger.info("CONVERTED ENTITY MENTION: " + entity);
      AnnotationUtils.addEntityMention(sentence, entity);
      mentionsById.put(aceEntity.getId(), entity);
      // TODO: make Entity objects as needed
    }

    // convert relation mentions
    for (AceRelationMention aceRelation : aceRelations) {
      RelationMention relation = convertAceRelationMention(aceRelation, docId, sentence, mentionsById);
      // TODO: make Relation objects
      if (relation == null) {
        continue; // conversion produced nothing for this mention
      }
      relationCounts.incrementCount(relation.getType());
      logger.info("CONVERTED RELATION MENTION: " + relation);
      AnnotationUtils.addRelationMention(sentence, relation);
    }

    // convert EventMentions
    for (AceEventMention aceEvent : aceEvents) {
      EventMention event = convertAceEventMention(aceEvent, docId, sentence, mentionsById, tokenOffset);
      // TODO: make Event objects
      if (event == null) {
        continue; // conversion produced nothing for this mention
      }
      eventCounts.incrementCount(event.getType());
      logger.info("CONVERTED EVENT MENTION: " + event);
      AnnotationUtils.addEventMention(sentence, event);
    }

    sentences.add(sentence);
    tokenOffset += aceTokens.size();
  }
  return sentences;
}
Aggregations