Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
Class CoNLLDocumentReader, method extractGoldMentions.
// Extract gold mentions (mention span, mention ID, cluster ID).
public List<List<Mention>> extractGoldMentions(CoNLLDocument conllDoc) {
  List<CoreMap> sentences = conllDoc.getAnnotation().get(CoreAnnotations.SentencesAnnotation.class);
  List<List<Mention>> allGoldMentions = new ArrayList<>();
  CollectionValuedMap<String, CoreMap> corefChainMap = conllDoc.getCorefChainMap();
  for (int i = 0; i < sentences.size(); i++) {
    allGoldMentions.add(new ArrayList<>());
  }
  for (String corefIdStr : corefChainMap.keySet()) {
    int id = Integer.parseInt(corefIdStr);
    for (CoreMap m : corefChainMap.get(corefIdStr)) {
      Mention mention = new Mention();
      mention.goldCorefClusterID = id;
      int sentIndex = m.get(CoreAnnotations.SentenceIndexAnnotation.class);
      CoreMap sent = sentences.get(sentIndex);
      // The mention's token offsets are document-level; subtracting the
      // sentence's starting token offset yields sentence-relative indices.
      mention.startIndex = m.get(CoreAnnotations.TokenBeginAnnotation.class) - sent.get(CoreAnnotations.TokenBeginAnnotation.class);
      mention.endIndex = m.get(CoreAnnotations.TokenEndAnnotation.class) - sent.get(CoreAnnotations.TokenBeginAnnotation.class);
      mention.originalSpan = m.get(CoreAnnotations.TokensAnnotation.class);
      allGoldMentions.get(sentIndex).add(mention);
    }
  }
  return allGoldMentions;
}
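A minimal usage sketch, assuming the reader is constructed the way the main() method further below constructs it; the input path is a hypothetical placeholder:

// Sketch: read the first document of a CoNLL corpus and count the gold
// mentions per sentence. The path is illustrative, not a real corpus location.
CoNLLDocumentReader reader = new CoNLLDocumentReader("/path/to/conll", new Options());
CoNLLDocument doc = reader.getNextDocument();
List<List<Mention>> gold = reader.extractGoldMentions(doc);
for (int i = 0; i < gold.size(); i++) {
  System.out.println("sentence " + i + ": " + gold.get(i).size() + " gold mentions");
}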
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
Class CoNLLDocumentReader, method main.
/** Reads and dumps output, mainly for debugging. */
public static void main(String[] args) throws IOException {
  Properties props = StringUtils.argsToProperties(args);
  boolean debug = false;
  String filepath = props.getProperty("i");
  String outfile = props.getProperty("o");
  if (filepath == null || outfile == null) {
    usage();
    System.exit(-1);
  }
  PrintWriter fout = new PrintWriter(outfile);
  logger.info("Writing to " + outfile);
  String ext = props.getProperty("ext");
  Options options;
  if (ext != null) {
    options = new Options(".*" + ext + "$");
  } else {
    options = new Options();
  }
  options.annotateTreeCoref = true;
  options.annotateTreeNer = true;
  CorpusStats corpusStats = new CorpusStats();
  CoNLLDocumentReader reader = new CoNLLDocumentReader(filepath, options);
  int docCnt = 0;
  int sentCnt = 0;
  int tokenCnt = 0;
  for (CoNLLDocument doc; (doc = reader.getNextDocument()) != null; ) {
    corpusStats.process(doc);
    docCnt++;
    Annotation anno = doc.getAnnotation();
    if (debug)
      System.out.println("Document " + docCnt + ": " + anno.get(CoreAnnotations.DocIDAnnotation.class));
    for (CoreMap sentence : anno.get(CoreAnnotations.SentencesAnnotation.class)) {
      if (debug)
        System.out.println("Parse: " + sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
      if (debug)
        System.out.println("Sentence Tokens: " + StringUtils.join(sentence.get(CoreAnnotations.TokensAnnotation.class), ","));
      writeTabSep(fout, sentence, doc.corefChainMap);
      sentCnt++;
      tokenCnt += sentence.get(CoreAnnotations.TokensAnnotation.class).size();
    }
    if (debug) {
      for (CoreMap ner : doc.nerChunks) {
        System.out.println("NER Chunk: " + ner);
      }
      for (String id : doc.corefChainMap.keySet()) {
        System.out.println("Coref: " + id + " = " + StringUtils.join(doc.corefChainMap.get(id), ";"));
      }
    }
  }
  fout.close();
  System.out.println("Total document count: " + docCnt);
  System.out.println("Total sentence count: " + sentCnt);
  System.out.println("Total token count: " + tokenCnt);
  System.out.println(corpusStats);
}
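Since StringUtils.argsToProperties turns "-key value" pairs into properties, this main() expects -i (input path) and -o (output file), with -ext as an optional file-extension filter. A sketch of an equivalent programmatic invocation, with placeholder paths:

// Equivalent to running on the command line with:
//   -i /path/to/conll -o /tmp/conll-dump.tsv
String[] argv = { "-i", "/path/to/conll", "-o", "/tmp/conll-dump.tsv" };
CoNLLDocumentReader.main(argv);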
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
Class HybridCorefPrinter, method printMentionDetectionLog.
public static String printMentionDetectionLog(Document document) {
  StringBuilder sbLog = new StringBuilder();
  List<CoreMap> sentences = document.annotation.get(SentencesAnnotation.class);
  sbLog.append("\nERROR START-----------------------------------------------------------------------\n");
  for (int i = 0; i < sentences.size(); i++) {
    sbLog.append("\nSENT ").append(i).append(" GOLD : ").append(HybridCorefPrinter.sentenceStringWithMention(i, document, true, false)).append("\n");
    sbLog.append("SENT ").append(i).append(" PREDICT: ").append(HybridCorefPrinter.sentenceStringWithMention(i, document, false, false)).append("\n");
    // A predicted mention with no gold twin is spurious.
    for (Mention p : document.predictedMentions.get(i)) {
      sbLog.append("\n");
      if (!p.hasTwin)
        sbLog.append("\tSPURIOUS");
      sbLog.append("\tmention: ").append(p.spanToString()).append("\t\t\theadword: ").append(p.headString).append("\tPOS: ").append(p.headWord.tag()).append("\tmentiontype: ").append(p.mentionType).append("\tnumber: ").append(p.number).append("\tgender: ").append(p.gender).append("\tanimacy: ").append(p.animacy).append("\tperson: ").append(p.person).append("\tNE: ").append(p.nerString);
    }
    sbLog.append("\n");
    // A gold mention with no predicted twin was missed by mention detection.
    for (Mention g : document.goldMentions.get(i)) {
      if (!g.hasTwin) {
        sbLog.append("\tmissed gold: ").append(g.spanToString()).append("\tPOS: ").append(g.headWord.tag()).append("\tmentiontype: ").append(g.mentionType).append("\theadword: ").append(g.headString).append("\tnumber: ").append(g.number).append("\tgender: ").append(g.gender).append("\tanimacy: ").append(g.animacy).append("\tperson: ").append(g.person).append("\tNE: ").append(g.nerString).append("\n");
        if (g.sentenceWords != null)
          if (g.sentenceWords.size() > g.endIndex)
            sbLog.append("\tnextword: ").append(g.sentenceWords.get(g.endIndex)).append("\t").append(g.sentenceWords.get(g.endIndex).tag()).append("\n");
        if (g.contextParseTree != null)
          sbLog.append(g.contextParseTree.pennString()).append("\n\n");
        else
          sbLog.append("\n\n");
      }
    }
    if (sentences.get(i).get(TreeAnnotation.class) != null)
      sbLog.append("\n\tparse: \n").append(sentences.get(i).get(TreeAnnotation.class).pennString());
    sbLog.append("\n\tcollapsedDependency: \n").append(sentences.get(i).get(BasicDependenciesAnnotation.class));
  }
  sbLog.append("ERROR END -----------------------------------------------------------------------\n");
  return sbLog.toString();
}
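A minimal call-site sketch, assuming a Document whose gold and predicted mentions have already been aligned by the upstream mention-matching step (so that hasTwin is set); java.util.logging is used purely for illustration:

// Sketch: emit the mention-detection error report for one document.
java.util.logging.Logger log = java.util.logging.Logger.getLogger("coref");
log.info(HybridCorefPrinter.printMentionDetectionLog(document));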
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
Class HybridCorefPrinter, method sentenceStringWithMention.
public static String sentenceStringWithMention(int i, Document document, boolean gold, boolean printClusterID) {
  StringBuilder sentStr = new StringBuilder();
  List<CoreMap> sentences = document.annotation.get(CoreAnnotations.SentencesAnnotation.class);
  List<List<Mention>> allMentions;
  if (gold) {
    allMentions = document.goldMentions;
  } else {
    allMentions = document.predictedMentions;
  }
  // String filename = document.annotation.get()
  int previousOffset = 0;
  CoreMap sentence = sentences.get(i);
  List<Mention> mentions = allMentions.get(i);
  List<CoreLabel> t = sentence.get(CoreAnnotations.TokensAnnotation.class);
  String speaker = t.get(0).get(SpeakerAnnotation.class);
  if (NumberMatchingRegex.isDecimalInteger(speaker))
    speaker = speaker + ": " + document.predictedMentionsByID.get(Integer.parseInt(speaker)).spanToString();
  sentStr.append("\tspeaker: " + speaker + " (" + t.get(0).get(UtteranceAnnotation.class) + ") ");
  String[] tokens = new String[t.size()];
  for (CoreLabel c : t) {
    tokens[c.index() - 1] = c.word();
  }
  // if(previousOffset+2 < t.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) && printClusterID) {
  //   sentStr.append("\n");
  // }
  previousOffset = t.get(t.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
  Counter<Integer> startCounts = new ClassicCounter<>();
  Counter<Integer> endCounts = new ClassicCounter<>();
  Map<Integer, Deque<Mention>> endMentions = Generics.newHashMap();
  for (Mention m : mentions) {
    // if(!gold && (document.corefClusters.get(m.corefClusterID)==null || document.corefClusters.get(m.corefClusterID).getCorefMentions().size()<=1)) {
    //   continue;
    // }
    startCounts.incrementCount(m.startIndex);
    endCounts.incrementCount(m.endIndex);
    if (!endMentions.containsKey(m.endIndex))
      endMentions.put(m.endIndex, new ArrayDeque<>());
    endMentions.get(m.endIndex).push(m);
  }
  for (int j = 0; j < tokens.length; j++) {
    if (endMentions.containsKey(j)) {
      for (Mention m : endMentions.get(j)) {
        int id = (gold) ? m.goldCorefClusterID : m.corefClusterID;
        id = (printClusterID) ? id : m.mentionID;
        sentStr.append("]_").append(id);
      }
    }
    for (int k = 0; k < startCounts.getCount(j); k++) {
      if (sentStr.length() > 0 && sentStr.charAt(sentStr.length() - 1) != '[')
        sentStr.append(" ");
      sentStr.append("[");
    }
    if (sentStr.length() > 0 && sentStr.charAt(sentStr.length() - 1) != '[')
      sentStr.append(" ");
    sentStr.append(tokens[j]);
  }
  if (endMentions.containsKey(tokens.length)) {
    for (Mention m : endMentions.get(tokens.length)) {
      int id = (gold) ? m.goldCorefClusterID : m.corefClusterID;
      id = (printClusterID) ? id : m.mentionID;
      // append("_").append(m.mentionID);
      sentStr.append("]_").append(id);
    }
  }
  return sentStr.toString();
}
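A usage sketch, assuming a fully annotated Document: print gold and predicted bracketings side by side for every sentence, labeling each closing bracket with its coref cluster ID:

// Sketch: compare gold vs. predicted mention bracketings per sentence.
int numSents = document.annotation.get(CoreAnnotations.SentencesAnnotation.class).size();
for (int i = 0; i < numSents; i++) {
  System.out.println("GOLD:    " + HybridCorefPrinter.sentenceStringWithMention(i, document, true, true));
  System.out.println("PREDICT: " + HybridCorefPrinter.sentenceStringWithMention(i, document, false, true));
}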
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
Class CorefMentionFinder, method parse.
private Tree parse(List<CoreLabel> tokens, List<ParserConstraint> constraints) {
  // Wrap the tokens in a single-sentence Annotation, attach the parser
  // constraints, run the parser annotator, and return the resulting tree.
  CoreMap sent = new Annotation("");
  sent.set(CoreAnnotations.TokensAnnotation.class, tokens);
  sent.set(ParserAnnotations.ConstraintAnnotation.class, constraints);
  Annotation doc = new Annotation("");
  List<CoreMap> sents = new ArrayList<>(1);
  sents.add(sent);
  doc.set(CoreAnnotations.SentencesAnnotation.class, sents);
  getParser().annotate(doc);
  sents = doc.get(CoreAnnotations.SentencesAnnotation.class);
  return sents.get(0).get(TreeCoreAnnotations.TreeAnnotation.class);
}
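A hedged sketch of how a constrained re-parse like this is typically driven: a ParserConstraint forces a token span to be analyzed as a single constituent. The token list and span indices below are hypothetical:

// Sketch: require tokens [2, 5) to form one constituent of any label (".*").
// sentenceTokens is an assumed List<CoreLabel> for the sentence at hand.
List<ParserConstraint> constraints = new ArrayList<>(1);
constraints.add(new ParserConstraint(2, 5, Pattern.compile(".*")));
Tree constrainedTree = parse(sentenceTokens, constraints);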