Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
The class Util, method writeConllFile.
public static void writeConllFile(String outFile, List<CoreMap> sentences, List<DependencyTree> trees) {
  try {
    PrintWriter output = IOUtils.getPrintWriter(outFile);
    for (int i = 0; i < sentences.size(); i++) {
      CoreMap sentence = sentences.get(i);
      DependencyTree tree = trees.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (int j = 1, size = tokens.size(); j <= size; ++j) {
        CoreLabel token = tokens.get(j - 1);
        // One CoNLL-X line per token: index, form, lemma (_), coarse POS, POS,
        // feats (_), head index, dependency label, and two unused columns.
        output.printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n", j, token.word(), token.tag(), token.tag(), tree.getHead(j), tree.getLabel(j));
      }
      output.println();
    }
    output.close();
  } catch (Exception e) {
    throw new RuntimeIOException(e);
  }
}
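A minimal round-trip sketch of how one might call this (assuming the companion Util.loadConllFile(String, List<CoreMap>, List<DependencyTree>) overload in the same class; the file names are placeholders):
List<CoreMap> sentences = new ArrayList<>();
List<DependencyTree> trees = new ArrayList<>();
// Read CoNLL-X input into parallel sentence/tree lists, then write it back out.
Util.loadConllFile("input.conll", sentences, trees);
Util.writeConllFile("output.conll", sentences, trees);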
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
The class ParserDemo, method demoAPI.
/**
* demoAPI demonstrates other ways of calling the parser with
* already tokenized text, or in some cases, raw text that needs to
* be tokenized as a single sentence. Output is handled with a
* TreePrint object. Note that the options used when creating the
* TreePrint can determine what results to print out. Once again,
* one can capture the output by passing a PrintWriter to
* TreePrint.printTree. This code is for English.
*/
public static void demoAPI(LexicalizedParser lp) {
  // This option shows parsing a list of correctly tokenized words
  String[] sent = { "This", "is", "an", "easy", "sentence", "." };
  List<CoreLabel> rawWords = SentenceUtils.toCoreLabelList(sent);
  Tree parse = lp.apply(rawWords);
  parse.pennPrint();
  System.out.println();
  // This option shows loading and using an explicit tokenizer
  String sent2 = "This is another sentence.";
  TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sent2));
  List<CoreLabel> rawWords2 = tok.tokenize();
  parse = lp.apply(rawWords2);
  // PennTreebankLanguagePack for English
  TreebankLanguagePack tlp = lp.treebankLanguagePack();
  GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
  GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
  List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
  System.out.println(tdl);
  System.out.println();
  // You can also use a TreePrint object to print trees and dependencies
  TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
  tp.printTree(parse);
}
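As the Javadoc above notes, the TreePrint output can be captured by passing a PrintWriter to printTree. A minimal sketch, using the standard English PCFG model path and java.io.StringWriter/PrintWriter:
LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
String[] words = { "This", "is", "an", "easy", "sentence", "." };
Tree parse = lp.apply(SentenceUtils.toCoreLabelList(words));
StringWriter sw = new StringWriter();
TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
// printTree writes to the supplied PrintWriter instead of stdout
tp.printTree(parse, new PrintWriter(sw, true));
String captured = sw.toString();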
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
The class Tdiff, method markDiff.
/**
* Marks bracketings in t2 not in t1 using the DoAnnotation field.
* Returns the set of brackets in t1 not in t2.
*
* @param t1 The reference tree
* @param t2 The tree whose differing bracketings are marked
* @return The set of brackets in t1 not in t2
*/
public static Set<Constituent> markDiff(Tree t1, Tree t2) {
  // if (t1 == null || t2 == null || ! t1.value().equals(t2.value())) {
  //   System.err.printf("t1 value is %s; t2 value is %s; t1 is %s t2 is %s", t1.value(), t2.value(), t1, t2);
  // }
  Set<Constituent> t1Labels = (t1 == null) ? Generics.<Constituent>newHashSet() : t1.constituents(cf);
  if (t2 != null) {
    t2.setSpans();
    for (Tree subTree : t2) {
      if (subTree.isPhrasal()) {
        IntPair span = subTree.getSpan();
        Constituent c = cf.newConstituent(span.getSource(), span.getTarget(), subTree.label(), 0.0);
        if (t1Labels.contains(c)) {
          t1Labels.remove(c);
          ((CoreLabel) subTree.label()).set(CoreAnnotations.DoAnnotation.class, false);
        } else {
          ((CoreLabel) subTree.label()).set(CoreAnnotations.DoAnnotation.class, true);
        }
      }
    }
  }
  return t1Labels;
}
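A minimal usage sketch, assuming both trees carry CoreLabel labels (markDiff casts t2's phrasal labels to CoreLabel to set DoAnnotation); the bracketings are invented for illustration:
Tree gold = Tree.valueOf("(ROOT (S (NP (DT The) (NN cat)) (VP (VBD sat))))");
Tree guess = Tree.valueOf("(ROOT (S (NP (DT The)) (VP (NN cat) (VBD sat))))");
// Brackets found only in gold come back in the returned set; brackets found only
// in guess are flagged on their labels via CoreAnnotations.DoAnnotation.
Set<Constituent> goldOnly = Tdiff.markDiff(gold, guess);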
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
The class ATBTreeUtils, method taggedStringFromTree.
/**
* Converts a parse tree into a string of tokens. Each token is a word and
* its POS tag separated by the delimiter specified by <code>separator</code>.
*
* @param t - A parse tree
* @param removeEscaping - If true, remove LDC escape characters. Otherwise, leave them.
* @param separator Word/tag separator
* @return A string of tagged words
*/
public static String taggedStringFromTree(Tree t, boolean removeEscaping, String separator) {
  t = t.prune(emptyFilter, tf);
  List<CoreLabel> taggedSentence = t.taggedLabeledYield();
  for (CoreLabel token : taggedSentence) {
    String word = (removeEscaping) ? unEscape(token.word()) : token.word();
    token.setWord(word);
    token.setValue(word);
  }
  return SentenceUtils.listToString(taggedSentence, false, separator);
}
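A small usage sketch (the tree and separator are invented for illustration; escaping is left alone since LDC escapes only appear in ATB data):
Tree t = Tree.valueOf("(S (NP (DT the) (NN tree)) (VP (VBZ is) (ADJP (JJ small))))");
// Produces the word/tag yield of the tree, e.g. "the/DT tree/NN is/VBZ small/JJ"
String tagged = ATBTreeUtils.taggedStringFromTree(t, false, "/");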
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
The class ArabicTreeNormalizer, method normalizeWholeTree.
@Override
public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
  tree = tree.prune(emptyFilter, tf).spliceOut(aOverAFilter, tf);
  for (Tree t : tree) {
    if (t.isLeaf()) {
      // Strip off morphological analyses and place them in the OriginalTextAnnotation, which is
      // specified by HasContext.
      if (t.value().contains(MorphoFeatureSpecification.MORPHO_MARK)) {
        String[] toks = t.value().split(MorphoFeatureSpecification.MORPHO_MARK);
        if (toks.length != 2)
          System.err.printf("%s: Word contains malformed morph annotation: %s%n", this.getClass().getName(), t.value());
        else if (t.label() instanceof CoreLabel) {
          ((CoreLabel) t.label()).setValue(toks[0].trim().intern());
          ((CoreLabel) t.label()).setWord(toks[0].trim().intern());
          Pair<String, String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(toks[0], toks[1]);
          String lemma = lemmaMorph.first();
          String morphAnalysis = lemmaMorph.second();
          if (lemma.equals(toks[0])) {
            ((CoreLabel) t.label()).setOriginalText(toks[1].trim().intern());
          } else {
            // TODO(spenceg): Does this help?
            String newLemma = lexMapper.map(null, lemma);
            if (newLemma == null || newLemma.trim().length() == 0) {
              newLemma = lemma;
            }
            String newMorphAnalysis = newLemma + MorphoFeatureSpecification.LEMMA_MARK + morphAnalysis;
            ((CoreLabel) t.label()).setOriginalText(newMorphAnalysis.intern());
          }
        } else {
          System.err.printf("%s: Cannot store morph analysis in non-CoreLabel: %s%n", this.getClass().getName(), t.label().getClass().getName());
        }
      }
    } else if (t.isPreTerminal()) {
      if (t.value() == null || t.value().equals("")) {
        System.err.printf("%s: missing tag for\n%s\n", this.getClass().getName(), t.pennString());
      } else if (t.label() instanceof HasTag) {
        ((HasTag) t.label()).setTag(t.value());
      }
    } else {
      // Phrasal nodes
      // Some nodes are missing preterminals. We'll splice in a tag for these.
      int nk = t.numChildren();
      List<Tree> newKids = new ArrayList<>(nk);
      for (int j = 0; j < nk; j++) {
        Tree child = t.getChild(j);
        if (child.isLeaf()) {
          System.err.printf("%s: Splicing in DUMMYTAG for%n%s%n", this.getClass().getName(), t.toString());
          newKids.add(tf.newTreeNode("DUMMYTAG", Collections.singletonList(child)));
        } else {
          newKids.add(child);
        }
      }
      t.setChildren(newKids);
    }
  }
  // special global coding for moving PRD annotation from constituent to verb tag.
  if (markPRDverb) {
    TregexMatcher m = prdVerbPattern.matcher(tree);
    Tree match = null;
    while (m.find()) {
      if (m.getMatch() != match) {
        match = m.getMatch();
        match.label().setValue(match.label().value() + "-PRDverb");
        Tree prd = m.getNode("prd");
        prd.label().setValue(super.normalizeNonterminal(prd.label().value()));
      }
    }
  }
  // Mark *only* subjects in verb-initial clauses
  if (retainNPSbj) {
    TregexMatcher m = npSbjPattern.matcher(tree);
    while (m.find()) {
      Tree match = m.getMatch();
      match.label().setValue("NP");
    }
  }
  if (tree.isPreTerminal()) {
    // The whole tree is a bare tag: bad!
    String val = tree.label().value();
    if (val.equals("CC") || val.startsWith("PUNC") || val.equals("CONJ")) {
      System.err.printf("%s: Bare tagged word being wrapped in FRAG\n%s\n", this.getClass().getName(), tree.pennString());
      tree = tf.newTreeNode("FRAG", Collections.singletonList(tree));
    } else {
      System.err.printf("%s: Bare tagged word\n%s\n", this.getClass().getName(), tree.pennString());
    }
  }
  // If the tree is now empty, this method will return null. In this case, readers e.g. PennTreeReader will try to read the next tree.
  while (tree != null && (tree.value() == null || tree.value().equals("")) && tree.numChildren() <= 1)
    tree = tree.firstChild();
  if (tree != null && !tree.value().equals(rootLabel))
    tree = tf.newTreeNode(rootLabel, Collections.singletonList(tree));
  return tree;
}
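Normalizers like this one are rarely called directly; a TreeReader applies them to every tree it reads. A rough sketch, assuming ArabicTreeReaderFactory's default configuration wires in this normalizer and using a placeholder file path:
TreeReaderFactory trf = new ArabicTreeReaderFactory();
TreeReader tr = trf.newTreeReader(new BufferedReader(new FileReader("atb.penn")));
// Every tree returned here has already passed through normalizeWholeTree;
// trees that normalize to null are skipped by the reader.
for (Tree t = tr.readTree(); t != null; t = tr.readTree()) {
  System.out.println(t.pennString());
}
tr.close();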