use of edu.stanford.nlp.ling.IndexedWord in project CoreNLP by stanfordnlp.
the class UniversalDependenciesFeatureAnnotator method addFeatures.
public void addFeatures(SemanticGraph sg, Tree t, boolean addLemma, boolean addUPOS) {
  Set<Integer> imperatives = t != null ? getImperatives(t) : new HashSet<>();
  for (IndexedWord word : sg.vertexListSorted()) {
    String posTag = word.get(CoreAnnotations.PartOfSpeechAnnotation.class);
    String token = word.get(CoreAnnotations.TextAnnotation.class);
    Integer index = word.get(CoreAnnotations.IndexAnnotation.class);
    HashMap<String, String> wordFeatures = word.get(CoreAnnotations.CoNLLUFeats.class);
    if (wordFeatures == null) {
      wordFeatures = new HashMap<>();
      word.set(CoreAnnotations.CoNLLUFeats.class, wordFeatures);
    }
    /* Features that only depend on the word and the PTB POS tag. */
    wordFeatures.putAll(getPOSFeatures(token, posTag));
    /* Semantic graph features. */
    wordFeatures.putAll(getGraphFeatures(sg, word));
    /* Handle VBs. */
    if (imperatives.contains(index)) {
      /* Imperative */
      wordFeatures.put("VerbForm", "Fin");
      wordFeatures.put("Mood", "Imp");
    } else if (posTag.equals("VB")) {
      /* Infinitive */
      wordFeatures.put("VerbForm", "Inf");
      /* Subjunctive detection too unreliable. */
      //} else {
      //  /* Present subjunctive */
      //  wordFeatures.put("VerbForm", "Fin");
      //  wordFeatures.put("Tense", "Pres");
      //  wordFeatures.put("Mood", "Subj");
      //}
    }
    String lemma = word.get(CoreAnnotations.LemmaAnnotation.class);
    if (addLemma && (lemma == null || lemma.equals("_"))) {
      word.set(CoreAnnotations.LemmaAnnotation.class, morphology.lemma(token, posTag));
    }
  }
  if (addUPOS && t != null) {
    t = UniversalPOSMapper.mapTree(t);
    List<Label> uPOSTags = t.preTerminalYield();
    List<IndexedWord> yield = sg.vertexListSorted();
    for (IndexedWord word : yield) {
      Label uPOSTag = uPOSTags.get(word.index() - 1);
      word.set(CoreAnnotations.CoarseTagAnnotation.class, uPOSTag.value());
    }
  }
}
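Once addFeatures has run, each vertex in the graph carries its CoNLLUFeats map. A minimal read-back sketch, using only the annotation keys that appear above (sg is assumed to be the same SemanticGraph that was passed in):

// Sketch: dump the morphological features addFeatures attached to each vertex.
for (IndexedWord word : sg.vertexListSorted()) {
  String form = word.get(CoreAnnotations.TextAnnotation.class);
  HashMap<String, String> feats = word.get(CoreAnnotations.CoNLLUFeats.class);
  System.out.println(form + "\t" + (feats == null ? "_" : feats));
}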
use of edu.stanford.nlp.ling.IndexedWord in project CoreNLP by stanfordnlp.
the class TSVUtils method parseTree.
/**
 * Parse a CoNLL-formatted tree into a SemanticGraph.
 * @param conll The CoNLL tree to parse.
 * @param tokens The tokens of the sentence, to form the backing labels of the tree.
 * @return A semantic graph of the sentence, according to the given tree.
 */
public static SemanticGraph parseTree(String conll, List<CoreLabel> tokens) {
  SemanticGraph tree = new SemanticGraph();
  if (conll == null || conll.isEmpty()) {
    return tree;
  }
  String[] treeLines = newline.split(conll);
  IndexedWord[] vertices = new IndexedWord[tokens.size() + 2];
  // Add edges
  for (String line : treeLines) {
    // Parse row
    String[] fields = tab.split(line);
    int dependentIndex = Integer.parseInt(fields[0]);
    if (vertices[dependentIndex] == null) {
      if (dependentIndex > tokens.size()) {
        // Bizarre mismatch in sizes; the malt parser seems to do this often
        return new SemanticGraph();
      }
      vertices[dependentIndex] = new IndexedWord(tokens.get(dependentIndex - 1));
    }
    IndexedWord dependent = vertices[dependentIndex];
    int governorIndex = Integer.parseInt(fields[1]);
    if (governorIndex > tokens.size()) {
      // Bizarre mismatch in sizes; the malt parser seems to do this often
      return new SemanticGraph();
    }
    if (vertices[governorIndex] == null && governorIndex > 0) {
      vertices[governorIndex] = new IndexedWord(tokens.get(governorIndex - 1));
    }
    IndexedWord governor = vertices[governorIndex];
    String relation = fields[2];
    // Process row
    if (governorIndex == 0) {
      tree.addRoot(dependent);
    } else {
      tree.addVertex(dependent);
      if (!tree.containsVertex(governor)) {
        tree.addVertex(governor);
      }
      if (!"ref".equals(relation)) {
        tree.addEdge(governor, dependent, GrammaticalRelation.valueOf(Language.English, relation), Double.NEGATIVE_INFINITY, false);
      }
    }
  }
  return tree;
}
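A hypothetical call for a two-word sentence; the words, indices, and relation names are made up for illustration, and each row follows the dependent-index / governor-index / relation tab-separated layout the loop above expects:

// Build backing labels for the sentence "Dogs bark."
List<CoreLabel> tokens = new ArrayList<>();
String[] words = {"Dogs", "bark"};
for (int i = 0; i < words.length; i++) {
  CoreLabel label = new CoreLabel();
  label.setWord(words[i]);
  label.setValue(words[i]);
  label.setIndex(i + 1); // 1-based, matching the CoNLL indices
  tokens.add(label);
}
// "bark" governs "Dogs" via nsubj; governor index 0 marks the root.
String conll = "1\t2\tnsubj\n2\t0\troot";
SemanticGraph sg = TSVUtils.parseTree(conll, tokens);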
use of edu.stanford.nlp.ling.IndexedWord in project CoreNLP by stanfordnlp.
the class ProtobufAnnotationSerializer method fromProto.
/**
 * Returns a complete document, intended to mimic a document passed as input to
 * {@link ProtobufAnnotationSerializer#toProto(Annotation)} as closely as possible.
 * That is, most common fields are serialized, but there is no guarantee that custom additions
 * will be saved and retrieved.
 *
 * @param proto The protocol buffer to read the document from.
 * @return An Annotation corresponding to the read protobuf.
 */
@SuppressWarnings("deprecation")
public Annotation fromProto(CoreNLPProtos.Document proto) {
  if (Thread.interrupted()) {
    throw new RuntimeInterruptedException();
  }
  // Set text
  Annotation ann = new Annotation(proto.getText());
  // If there are characters, add characters
  if (proto.getCharacterCount() > 0) {
    List<CoreLabel> docChars = new ArrayList<>();
    for (CoreNLPProtos.Token c : proto.getCharacterList()) {
      docChars.add(fromProto(c));
    }
    ann.set(SegmenterCoreAnnotations.CharactersAnnotation.class, docChars);
  }
  // Add tokens
  List<CoreLabel> tokens = new ArrayList<>();
  if (proto.getSentenceCount() > 0) {
    // Populate the tokens from the sentences
    for (CoreNLPProtos.Sentence sentence : proto.getSentenceList()) {
      // It's conceivable that the sentences are not contiguous -- pad this with nulls
      while (sentence.hasTokenOffsetBegin() && tokens.size() < sentence.getTokenOffsetBegin()) {
        tokens.add(null);
      }
      // Read the sentence
      for (CoreNLPProtos.Token token : sentence.getTokenList()) {
        CoreLabel coreLabel = fromProto(token);
        // Set docid
        if (proto.hasDocID()) {
          coreLabel.setDocID(proto.getDocID());
        }
        if (token.hasTokenBeginIndex() && token.hasTokenEndIndex()) {
          // This is usually true, if enough annotators are defined
          while (tokens.size() < sentence.getTokenOffsetEnd()) {
            tokens.add(null);
          }
          for (int i = token.getTokenBeginIndex(); i < token.getTokenEndIndex(); ++i) {
            tokens.set(i, coreLabel);
          }
        } else {
          // Assume this token spans a single token, and just add it to the tokens list
          tokens.add(coreLabel);
        }
      }
    }
  } else if (proto.getSentencelessTokenCount() > 0) {
    // Eek -- no sentences. Try to recover tokens directly
    for (CoreNLPProtos.Token token : proto.getSentencelessTokenList()) {
      CoreLabel coreLabel = fromProto(token);
      // Set docid
      if (proto.hasDocID()) {
        coreLabel.setDocID(proto.getDocID());
      }
      tokens.add(coreLabel);
    }
  }
  if (!tokens.isEmpty()) {
    ann.set(TokensAnnotation.class, tokens);
  }
  // Add sentences
  List<CoreMap> sentences = new ArrayList<>(proto.getSentenceCount());
  for (int sentIndex = 0; sentIndex < proto.getSentenceCount(); ++sentIndex) {
    CoreNLPProtos.Sentence sentence = proto.getSentence(sentIndex);
    CoreMap map = fromProtoNoTokens(sentence);
    if (!tokens.isEmpty() && sentence.hasTokenOffsetBegin() && sentence.hasTokenOffsetEnd() && map.get(TokensAnnotation.class) == null) {
      // Set tokens for sentence
      int tokenBegin = sentence.getTokenOffsetBegin();
      int tokenEnd = sentence.getTokenOffsetEnd();
      assert tokenBegin <= tokens.size() && tokenBegin <= tokenEnd;
      assert tokenEnd <= tokens.size();
      map.set(TokensAnnotation.class, tokens.subList(tokenBegin, tokenEnd));
      // Set sentence index + token index + paragraph index
      for (int i = tokenBegin; i < tokenEnd; ++i) {
        tokens.get(i).setSentIndex(sentIndex);
        tokens.get(i).setIndex(i - tokenBegin + 1);
        if (sentence.hasParagraph()) {
          tokens.get(i).set(ParagraphAnnotation.class, sentence.getParagraph());
        }
      }
      // Set text
      int characterBegin = sentence.getCharacterOffsetBegin();
      int characterEnd = sentence.getCharacterOffsetEnd();
      if (characterEnd <= proto.getText().length()) {
        // The usual case -- get the text from the document text
        map.set(TextAnnotation.class, proto.getText().substring(characterBegin, characterEnd));
      } else {
        // The document text is wrong -- guess the text from the tokens
        map.set(TextAnnotation.class, recoverOriginalText(tokens.subList(tokenBegin, tokenEnd), sentence));
      }
    }
    // End iteration
    sentences.add(map);
  }
  if (!sentences.isEmpty()) {
    ann.set(SentencesAnnotation.class, sentences);
  }
  // Set DocID
  String docid = null;
  if (proto.hasDocID()) {
    docid = proto.getDocID();
    ann.set(DocIDAnnotation.class, docid);
  }
  // Set reference time
  if (proto.hasDocDate()) {
    ann.set(DocDateAnnotation.class, proto.getDocDate());
  }
  if (proto.hasCalendar()) {
    GregorianCalendar calendar = new GregorianCalendar();
    calendar.setTimeInMillis(proto.getCalendar());
    ann.set(CalendarAnnotation.class, calendar);
  }
  // Set coref chains
  Map<Integer, CorefChain> corefChains = new HashMap<>();
  for (CoreNLPProtos.CorefChain chainProto : proto.getCorefChainList()) {
    CorefChain chain = fromProto(chainProto, ann);
    corefChains.put(chain.getChainID(), chain);
  }
  if (!corefChains.isEmpty()) {
    ann.set(CorefChainAnnotation.class, corefChains);
  }
  // Later in this method we need to attach SpeakerInfo to Mentions, and a SpeakerInfo may
  // reference any Mention in the document, so build id -> Mention and id -> proto Mention
  // maps up front.
  HashMap<Integer, Mention> idToMention = new HashMap<>();
  HashMap<Integer, CoreNLPProtos.Mention> idToProtoMention = new HashMap<>();
  // Set things in the sentence that need a document context.
  for (int sentenceIndex = 0; sentenceIndex < proto.getSentenceCount(); ++sentenceIndex) {
    CoreNLPProtos.Sentence sentence = proto.getSentenceList().get(sentenceIndex);
    CoreMap map = sentences.get(sentenceIndex);
    List<CoreLabel> sentenceTokens = map.get(TokensAnnotation.class);
    // Set dependency graphs
    if (sentence.hasBasicDependencies()) {
      map.set(BasicDependenciesAnnotation.class, fromProto(sentence.getBasicDependencies(), sentenceTokens, docid));
    }
    if (sentence.hasCollapsedDependencies()) {
      map.set(CollapsedDependenciesAnnotation.class, fromProto(sentence.getCollapsedDependencies(), sentenceTokens, docid));
    }
    if (sentence.hasCollapsedCCProcessedDependencies()) {
      map.set(CollapsedCCProcessedDependenciesAnnotation.class, fromProto(sentence.getCollapsedCCProcessedDependencies(), sentenceTokens, docid));
    }
    if (sentence.hasAlternativeDependencies()) {
      map.set(AlternativeDependenciesAnnotation.class, fromProto(sentence.getAlternativeDependencies(), sentenceTokens, docid));
    }
    if (sentence.hasEnhancedDependencies()) {
      map.set(EnhancedDependenciesAnnotation.class, fromProto(sentence.getEnhancedDependencies(), sentenceTokens, docid));
    }
    if (sentence.hasEnhancedPlusPlusDependencies()) {
      map.set(EnhancedPlusPlusDependenciesAnnotation.class, fromProto(sentence.getEnhancedPlusPlusDependencies(), sentenceTokens, docid));
    }
    // Set entailed sentences
    if (sentence.getEntailedSentenceCount() > 0) {
      Set<SentenceFragment> entailedSentences = sentence.getEntailedSentenceList().stream().map(frag -> fromProto(frag, map.get(EnhancedPlusPlusDependenciesAnnotation.class))).collect(Collectors.toSet());
      map.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, entailedSentences);
    }
    if (sentence.getEntailedClauseCount() > 0) {
      Set<SentenceFragment> entailedClauses = sentence.getEntailedClauseList().stream().map(frag -> fromProto(frag, map.get(CollapsedDependenciesAnnotation.class))).collect(Collectors.toSet());
      map.set(NaturalLogicAnnotations.EntailedClausesAnnotation.class, entailedClauses);
    }
    // Set relation triples
    if (sentence.getOpenieTripleCount() > 0) {
      List<RelationTriple> triples = new ArrayList<>();
      for (CoreNLPProtos.RelationTriple triple : sentence.getOpenieTripleList()) {
        triples.add(fromProto(triple, ann, sentenceIndex));
      }
      map.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class, triples);
    }
    // Redo some light annotation
    if (map.containsKey(TokensAnnotation.class) && (!sentence.hasHasNumerizedTokensAnnotation() || sentence.getHasNumerizedTokensAnnotation())) {
      map.set(NumerizedTokensAnnotation.class, NumberNormalizer.findAndMergeNumbers(map));
    }
    // Add the CoreLabel and IndexedWord info to each mention. When Mentions are serialized,
    // only the index in the sentence is stored for CoreLabels and IndexedWords; this is the
    // point where the deserialized sentence has tokens, so the references can be restored.
    int mentionInt = 0;
    for (CoreNLPProtos.Mention protoMention : sentence.getMentionsForCorefList()) {
      // Get the mention
      Mention mentionToUpdate = map.get(CorefMentionsAnnotation.class).get(mentionInt);
      // Store these in hashes for more processing later in this method
      idToMention.put(mentionToUpdate.mentionID, mentionToUpdate);
      idToProtoMention.put(mentionToUpdate.mentionID, protoMention);
      // Update the values
      int headIndexedWordIndex = protoMention.getHeadIndexedWord().getTokenIndex();
      if (headIndexedWordIndex >= 0) {
        mentionToUpdate.headIndexedWord = new IndexedWord(sentenceTokens.get(headIndexedWordIndex));
        mentionToUpdate.headIndexedWord.setCopyCount(protoMention.getHeadIndexedWord().getCopyCount());
      }
      int dependingVerbIndex = protoMention.getDependingVerb().getTokenIndex();
      if (dependingVerbIndex >= 0) {
        mentionToUpdate.dependingVerb = new IndexedWord(sentenceTokens.get(dependingVerbIndex));
        mentionToUpdate.dependingVerb.setCopyCount(protoMention.getDependingVerb().getCopyCount());
      }
      int headWordIndex = protoMention.getHeadWord().getTokenIndex();
      if (headWordIndex >= 0) {
        mentionToUpdate.headWord = sentenceTokens.get(headWordIndex);
      }
      mentionToUpdate.sentenceWords = new ArrayList<>();
      for (CoreNLPProtos.IndexedWord clp : protoMention.getSentenceWordsList()) {
        int ti = clp.getTokenIndex();
        mentionToUpdate.sentenceWords.add(sentenceTokens.get(ti));
      }
      mentionToUpdate.originalSpan = new ArrayList<>();
      for (CoreNLPProtos.IndexedWord clp : protoMention.getOriginalSpanList()) {
        int ti = clp.getTokenIndex();
        mentionToUpdate.originalSpan.add(sentenceTokens.get(ti));
      }
      if (protoMention.getHasBasicDependency()) {
        mentionToUpdate.basicDependency = map.get(BasicDependenciesAnnotation.class);
      }
      if (protoMention.getHasEnhancedDepenedncy()) { // (sic: the misspelling matches the generated protobuf accessor)
        mentionToUpdate.enhancedDependency = map.get(EnhancedDependenciesAnnotation.class);
      }
      if (protoMention.getHasContextParseTree()) {
        mentionToUpdate.contextParseTree = map.get(TreeAnnotation.class);
      }
      // Move on to the next mention
      mentionInt++;
    }
  }
  // Set quotes
  List<CoreMap> quotes = proto.getQuoteList().stream().map(quote -> fromProto(quote, tokens)).collect(Collectors.toList());
  if (!quotes.isEmpty()) {
    ann.set(QuotationsAnnotation.class, quotes);
  }
  // Set NER mentions
  List<CoreMap> mentions = proto.getMentionsList().stream().map(this::fromProto).collect(Collectors.toList());
  if (!mentions.isEmpty()) {
    ann.set(MentionsAnnotation.class, mentions);
  }
  // Restore the SpeakerInfo for each Mention, now that all Mentions are available
  for (int mentionID : idToMention.keySet()) {
    Mention mentionToUpdate = idToMention.get(mentionID);
    // This is the proto Mention message corresponding to this Mention
    CoreNLPProtos.Mention correspondingProtoMention = idToProtoMention.get(mentionID);
    if (!correspondingProtoMention.hasSpeakerInfo()) {
      // No speakerInfo was stored, so just continue to the next Mention
      continue;
    }
    // If we're here we know a speakerInfo was stored
    SpeakerInfo speakerInfo = fromProto(correspondingProtoMention.getSpeakerInfo());
    // Mention IDs are document-level: 0, 1, 2, etc.
    for (int speakerInfoMentionID : correspondingProtoMention.getSpeakerInfo().getMentionsList()) {
      speakerInfo.addMention(idToMention.get(speakerInfoMentionID));
    }
    // Now the SpeakerInfo for this Mention should be fully restored
    mentionToUpdate.speakerInfo = speakerInfo;
  }
  // Return
  return ann;
}
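A round-trip sketch; the no-argument serializer constructor and the pre-annotated annotation variable are assumptions here:

// Sketch: serialize an annotated document to a protobuf and read it back.
// Assumes `annotation` was produced by a StanfordCoreNLP pipeline.
ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer();
CoreNLPProtos.Document proto = serializer.toProto(annotation);
Annotation restored = serializer.fromProto(proto);
// Common fields survive the round trip; custom annotation keys may not (see the javadoc above).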
use of edu.stanford.nlp.ling.IndexedWord in project CoreNLP by stanfordnlp.
the class CustomAnnotationSerializer method saveDependencyGraph.
/**
 * Saves all arcs in the graph on two lines: the first line contains the vertices, the second the edges.
 * @param graph The graph to save; if null, two empty lines are printed.
 * @param pw The writer to print to.
 */
private static void saveDependencyGraph(SemanticGraph graph, PrintWriter pw) {
  if (graph == null) {
    pw.println();
    pw.println();
    return;
  }
  boolean outputHeader = false;
  for (IndexedWord node : graph.vertexSet()) {
    // Indicate: docid, sentence index
    if (!outputHeader) {
      String docId = node.get(CoreAnnotations.DocIDAnnotation.class);
      if (docId != null && docId.length() > 0)
        pw.print(docId);
      else
        pw.print("-");
      pw.print("\t");
      pw.print(node.get(CoreAnnotations.SentenceIndexAnnotation.class));
      outputHeader = true;
    }
    pw.print("\t");
    pw.print(node.index());
    // These annotations are usually not set, so print them only if necessary
    if (node.copyCount() > 0) {
      pw.print("-");
      pw.print(node.copyCount());
      // System.out.println("FOUND COPY ANNOTATION: " + node.get(CoreAnnotations.CopyAnnotation.class));
    }
    if (graph.getRoots().contains(node)) {
      if (node.copyCount() > 0) {
        pw.print("-R");
      } else {
        pw.print("-0-R");
      }
    }
  }
  pw.println();
  // Second line: all edges
  boolean first = true;
  for (SemanticGraphEdge edge : graph.edgeIterable()) {
    if (!first)
      pw.print("\t");
    String rel = edge.getRelation().toString();
    // No spaces allowed in the relation name;
    // note that they might occur due to the tokenization of HTML/XML/RDF tags
    rel = rel.replaceAll("\\s+", "");
    pw.print(rel);
    pw.print(" ");
    pw.print(edge.getSource().index());
    pw.print(" ");
    pw.print(edge.getTarget().index());
    if (edge.isExtra() || edge.getSource().copyCount() > 0 || edge.getTarget().copyCount() > 0) {
      pw.print(" ");
      pw.print(edge.isExtra());
      pw.print(" ");
      pw.print(edge.getSource().copyCount());
      pw.print(" ");
      pw.print(edge.getTarget().copyCount());
    }
    first = false;
  }
  pw.println();
}
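To make the format concrete: for a hypothetical two-word sentence "Dogs bark." (docid doc1, sentence index 0) whose root is token 2, the writer above would emit roughly the following two lines (tabs shown as <TAB>; vertex order depends on vertexSet() iteration):

doc1<TAB>0<TAB>1<TAB>2-0-R
nsubj 2 1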
use of edu.stanford.nlp.ling.IndexedWord in project CoreNLP by stanfordnlp.
the class CoNLLOutputter method print.
@Override
public void print(Annotation doc, OutputStream target, Options options) throws IOException {
  PrintWriter writer = new PrintWriter(IOUtils.encodedOutputStreamWriter(target, options.encoding));
  // vv A bunch of nonsense to get tokens vv
  if (doc.get(CoreAnnotations.SentencesAnnotation.class) != null) {
    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      if (sentence.get(CoreAnnotations.TokensAnnotation.class) != null) {
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        SemanticGraph depTree = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
        for (int i = 0; i < tokens.size(); ++i) {
          // Newline if applicable
          if (i > 0) {
            writer.println();
          }
          // Try to get the incoming dependency edge
          int head = -1;
          String deprel = null;
          if (depTree != null) {
            Set<Integer> rootSet = depTree.getRoots().stream().map(IndexedWord::index).collect(Collectors.toSet());
            IndexedWord node = depTree.getNodeByIndexSafe(i + 1);
            if (node != null) {
              List<SemanticGraphEdge> edgeList = depTree.getIncomingEdgesSorted(node);
              if (!edgeList.isEmpty()) {
                assert edgeList.size() == 1;
                head = edgeList.get(0).getGovernor().index();
                deprel = edgeList.get(0).getRelation().toString();
              } else if (rootSet.contains(i + 1)) {
                head = 0;
                deprel = "ROOT";
              }
            }
          }
          // Write the token
          writer.print(line(i + 1, tokens.get(i), head, deprel));
        }
      }
      writer.println();
      writer.println();
    }
  }
  writer.flush();
}
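A hypothetical end-to-end fragment; the annotator list and the AnnotationOutputter.Options constructor are assumptions here, so adjust them to your CoreNLP version:

// Sketch: run a dependency pipeline and print CoNLL to stdout.
Properties props = new Properties();
props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
Annotation doc = new Annotation("Dogs bark.");
pipeline.annotate(doc);
new CoNLLOutputter().print(doc, System.out, new AnnotationOutputter.Options());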