Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
From the class TreeSpanScoring, the method countSpanErrors.
/**
 * Counts how many spans are present in goldTree, including
 * preterminals, but not present in guessTree, along with how many
 * spans are present in guessTree and not goldTree. Each one counts
 * as an error, meaning that something like a mislabeled span or
 * preterminal counts as two errors.
 * <br>
 * Span labels are compared using the basicCategory() function
 * from the passed-in TreebankLanguagePack.
 */
public static int countSpanErrors(TreebankLanguagePack tlp, Tree goldTree, Tree guessTree) {
  Set<Constituent> goldConstituents = goldTree.constituents(LabeledConstituent.factory());
  Set<Constituent> guessConstituents = guessTree.constituents(LabeledConstituent.factory());
  Set<Constituent> simpleGoldConstituents = simplifyConstituents(tlp, goldConstituents);
  Set<Constituent> simpleGuessConstituents = simplifyConstituents(tlp, guessConstituents);
  //System.out.println(simpleGoldConstituents);
  //System.out.println(simpleGuessConstituents);
  int errors = 0;
  for (Constituent gold : simpleGoldConstituents) {
    if (!simpleGuessConstituents.contains(gold)) {
      ++errors;
    }
  }
  for (Constituent guess : simpleGuessConstituents) {
    if (!simpleGoldConstituents.contains(guess)) {
      ++errors;
    }
  }
  // The spans returned by constituents() don't include the
  // preterminals, so we need to count those ourselves now
  List<TaggedWord> goldWords = goldTree.taggedYield();
  List<TaggedWord> guessWords = guessTree.taggedYield();
  int len = Math.min(goldWords.size(), guessWords.size());
  for (int i = 0; i < len; ++i) {
    String goldTag = tlp.basicCategory(goldWords.get(i).tag());
    String guessTag = tlp.basicCategory(guessWords.get(i).tag());
    if (!goldTag.equals(guessTag)) {
      // a mismatched preterminal is a span present in the gold but not
      // the guess, and a span present in the guess but not the gold,
      // so it counts as two errors
      errors += 2;
    }
  }
  return errors;
}
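As a quick illustration of the two-errors-per-mislabel behavior described in the Javadoc, here is a minimal sketch of a driver, assuming TreeSpanScoring from edu.stanford.nlp.parser.metrics is on the classpath; the bracketed trees and the wrapper class are invented for illustration:

import edu.stanford.nlp.parser.metrics.TreeSpanScoring;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;

public class SpanErrorSketch {
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    // Identical bracketings except for one preterminal (NN vs. VB),
    // which should count as two errors: one span missing from the
    // guess, one spurious span in the guess.
    Tree gold = Tree.valueOf("(ROOT (S (NP (DT the) (NN run)) (VP (VBD ended))))");
    Tree guess = Tree.valueOf("(ROOT (S (NP (DT the) (VB run)) (VP (VBD ended))))");
    System.out.println(TreeSpanScoring.countSpanErrors(tlp, gold, guess)); // expect 2
  }
}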
Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
From the class DependencyParser, the method parseTextFile.
private void parseTextFile(BufferedReader input, PrintWriter output) {
  DocumentPreprocessor preprocessor = new DocumentPreprocessor(input);
  preprocessor.setSentenceFinalPuncWords(config.tlp.sentenceFinalPunctuationWords());
  preprocessor.setEscaper(config.escaper);
  preprocessor.setSentenceDelimiter(config.sentenceDelimiter);
  preprocessor.setTokenizerFactory(config.tlp.getTokenizerFactory());

  Timing timer = new Timing();

  // First pass: POS-tag every sentence in the input
  MaxentTagger tagger = new MaxentTagger(config.tagger);
  List<List<TaggedWord>> tagged = new ArrayList<>();
  for (List<HasWord> sentence : preprocessor) {
    tagged.add(tagger.tagSentence(sentence));
  }
  System.err.printf("Tagging completed in %.2f sec.%n", timer.stop() / 1000.0);

  // Second pass: parse the tagged sentences, printing one typed
  // dependency per line with a blank line between sentences
  timer.start();
  int numSentences = 0;
  for (List<TaggedWord> taggedSentence : tagged) {
    GrammaticalStructure parse = predict(taggedSentence);
    Collection<TypedDependency> deps = parse.typedDependencies();
    for (TypedDependency dep : deps) {
      output.println(dep);
    }
    output.println();
    numSentences++;
  }

  long millis = timer.stop();
  double seconds = millis / 1000.0;
  System.err.printf("Parsed %d sentences in %.2f seconds (%.2f sents/sec).%n",
      numSentences, seconds, numSentences / seconds);
}
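parseTextFile is private, but the same two-pass flow (tag everything, then parse everything) can be reproduced through the public API. A hedged sketch, with the file names and the wrapper class invented for illustration:

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.nndep.DependencyParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.TypedDependency;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;

public class ParseFileSketch {
  public static void main(String[] args) throws IOException {
    MaxentTagger tagger = new MaxentTagger(
        "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");
    DependencyParser parser = DependencyParser.loadFromModelFile(DependencyParser.DEFAULT_MODEL);

    // Pass 1: tag every sentence in the (hypothetical) input file.
    List<List<TaggedWord>> tagged = new ArrayList<>();
    for (List<HasWord> sentence : new DocumentPreprocessor("input.txt")) {
      tagged.add(tagger.tagSentence(sentence));
    }

    // Pass 2: parse the tagged sentences, one dependency per line,
    // blank line between sentences (as in parseTextFile above).
    try (PrintWriter output = new PrintWriter("output.txt")) {
      for (List<TaggedWord> taggedSentence : tagged) {
        GrammaticalStructure parse = parser.predict(taggedSentence);
        for (TypedDependency dep : parse.typedDependencies()) {
          output.println(dep);
        }
        output.println();
      }
    }
  }
}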
Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
From the class DependencyParserDemo, the method main.
public static void main(String[] args) {
  String modelPath = DependencyParser.DEFAULT_MODEL;
  String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

  for (int argIndex = 0; argIndex < args.length; ) {
    switch (args[argIndex]) {
      case "-tagger":
        taggerPath = args[argIndex + 1];
        argIndex += 2;
        break;
      case "-model":
        modelPath = args[argIndex + 1];
        argIndex += 2;
        break;
      default:
        throw new RuntimeException("Unknown argument " + args[argIndex]);
    }
  }

  String text = "I can almost always tell when movies use fake dinosaurs.";

  MaxentTagger tagger = new MaxentTagger(taggerPath);
  DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

  DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
  for (List<HasWord> sentence : tokenizer) {
    List<TaggedWord> tagged = tagger.tagSentence(sentence);
    GrammaticalStructure gs = parser.predict(tagged);
    // Print typed dependencies
    log.info(gs);
  }
}
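Instead of logging the whole GrammaticalStructure, the individual dependencies can be unpacked. A small sketch of a drop-in replacement for the log.info(gs) call above (assuming an extra import of edu.stanford.nlp.trees.TypedDependency; the output format is illustrative, not from the demo):

for (TypedDependency dep : gs.typedDependencies()) {
  // reln() is the grammatical relation (e.g. nsubj); gov() and dep()
  // are IndexedWords carrying the word form and 1-based sentence index
  System.out.printf("%s(%s-%d, %s-%d)%n",
      dep.reln(), dep.gov().word(), dep.gov().index(),
      dep.dep().word(), dep.dep().index());
}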
Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
From the class SplittingGrammarExtractor, the method initialBetasAndLexicon.
private int initialBetasAndLexicon(Tree tree, int position, double weight) {
  if (tree.isLeaf()) {
    // should never get here, unless a training tree is just one leaf
    return position;
  }
  if (tree.isPreTerminal()) {
    // fill in initial lexicon here
    String tag = tree.label().value();
    String word = tree.children()[0].label().value();
    TaggedWord tw = new TaggedWord(word, state(tag, 0));
    lex.train(tw, position, weight);
    return (position + 1);
  }
  if (tree.children().length == 2) {
    String label = tree.label().value();
    String leftLabel = tree.getChild(0).label().value();
    String rightLabel = tree.getChild(1).label().value();
    if (!binaryBetas.contains(label, leftLabel, rightLabel)) {
      double[][][] map = new double[1][1][1];
      map[0][0][0] = 0.0;
      binaryBetas.put(label, leftLabel, rightLabel, map);
    }
  } else if (tree.children().length == 1) {
    String label = tree.label().value();
    String childLabel = tree.getChild(0).label().value();
    if (!unaryBetas.contains(label, childLabel)) {
      double[][] map = new double[1][1];
      map[0][0] = 0.0;
      unaryBetas.put(label, childLabel, map);
    }
  } else {
    // should have been binarized
    throw new RuntimeException("Trees should have been binarized, expected 1 or 2 children");
  }
  for (Tree child : tree.children()) {
    position = initialBetasAndLexicon(child, position, weight);
  }
  return position;
}
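Two details worth noting. First, the 0.0 seeded into each beta map is a log-probability: with exactly one split state per label there is only one possible transition, and log 1 = 0. Second, state(tag, 0) maps a base tag to its split-0 variant; the helper is not part of this excerpt, so the sketch below is a guess at its shape (the "^" separator is an assumption):

// Hypothetical reconstruction of the state-naming helper; the real
// SplittingGrammarExtractor.state may differ (e.g. it may leave start
// symbols and the boundary tag unsplit).
private String state(String tag, int split) {
  return tag + "^" + split;  // e.g. "NN" with split 0 becomes "NN^0"
}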
Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
From the class SplittingGrammarExtractor, the method recalculateTemporaryBetas.
public int recalculateTemporaryBetas(Tree tree, double[] stateWeights, int position,
                                     IdentityHashMap<Tree, double[][]> unaryTransitions,
                                     IdentityHashMap<Tree, double[][][]> binaryTransitions,
                                     Map<String, double[]> totalStateMass,
                                     TwoDimensionalMap<String, String, double[][]> tempUnaryBetas,
                                     ThreeDimensionalMap<String, String, String, double[][][]> tempBinaryBetas) {
  if (tree.isLeaf()) {
    // possible to get here if we have a tree with no structure
    return position;
  }

  if (totalStateMass != null) {
    double[] stateTotal = totalStateMass.get(tree.label().value());
    if (stateTotal == null) {
      stateTotal = new double[stateWeights.length];
      totalStateMass.put(tree.label().value(), stateTotal);
    }
    for (int i = 0; i < stateWeights.length; ++i) {
      stateTotal[i] += Math.exp(stateWeights[i]);
    }
  }

  if (tree.isPreTerminal()) {
    // fill in our new lexicon here.
    String tag = tree.label().value();
    String word = tree.children()[0].label().value();
    // We smooth by LEX_SMOOTH, if relevant. We rescale so that the sum
    // of the weights being added to the lexicon stays the same.
    double total = 0.0;
    for (double stateWeight : stateWeights) {
      total += Math.exp(stateWeight);
    }
    if (total <= 0.0) {
      return position + 1;
    }
    double scale = 1.0 / (1.0 + LEX_SMOOTH);
    double smoothing = total * LEX_SMOOTH / stateWeights.length;
    for (int state = 0; state < stateWeights.length; ++state) {
      // TODO: maybe optimize all this TaggedWord creation
      TaggedWord tw = new TaggedWord(word, state(tag, state));
      tempLex.train(tw, position, (Math.exp(stateWeights[state]) + smoothing) * scale);
    }
    return position + 1;
  }

  if (tree.children().length == 1) {
    String parentLabel = tree.label().value();
    String childLabel = tree.children()[0].label().value();
    double[][] transitions = unaryTransitions.get(tree);
    int parentStates = transitions.length;
    int childStates = transitions[0].length;
    double[][] betas = tempUnaryBetas.get(parentLabel, childLabel);
    if (betas == null) {
      betas = new double[parentStates][childStates];
      for (int i = 0; i < parentStates; ++i) {
        for (int j = 0; j < childStates; ++j) {
          betas[i][j] = Double.NEGATIVE_INFINITY;
        }
      }
      tempUnaryBetas.put(parentLabel, childLabel, betas);
    }
    double[] childWeights = neginfDoubles(childStates);
    for (int i = 0; i < parentStates; ++i) {
      for (int j = 0; j < childStates; ++j) {
        double weight = transitions[i][j];
        betas[i][j] = SloppyMath.logAdd(betas[i][j], weight + stateWeights[i]);
        childWeights[j] = SloppyMath.logAdd(childWeights[j], weight + stateWeights[i]);
      }
    }
    position = recalculateTemporaryBetas(tree.children()[0], childWeights, position,
                                         unaryTransitions, binaryTransitions, totalStateMass,
                                         tempUnaryBetas, tempBinaryBetas);
  } else {
    // length == 2
    String parentLabel = tree.label().value();
    String leftLabel = tree.children()[0].label().value();
    String rightLabel = tree.children()[1].label().value();
    double[][][] transitions = binaryTransitions.get(tree);
    int parentStates = transitions.length;
    int leftStates = transitions[0].length;
    int rightStates = transitions[0][0].length;
    double[][][] betas = tempBinaryBetas.get(parentLabel, leftLabel, rightLabel);
    if (betas == null) {
      betas = new double[parentStates][leftStates][rightStates];
      for (int i = 0; i < parentStates; ++i) {
        for (int j = 0; j < leftStates; ++j) {
          for (int k = 0; k < rightStates; ++k) {
            betas[i][j][k] = Double.NEGATIVE_INFINITY;
          }
        }
      }
      tempBinaryBetas.put(parentLabel, leftLabel, rightLabel, betas);
    }
    double[] leftWeights = neginfDoubles(leftStates);
    double[] rightWeights = neginfDoubles(rightStates);
    for (int i = 0; i < parentStates; ++i) {
      for (int j = 0; j < leftStates; ++j) {
        for (int k = 0; k < rightStates; ++k) {
          double weight = transitions[i][j][k];
          betas[i][j][k] = SloppyMath.logAdd(betas[i][j][k], weight + stateWeights[i]);
          leftWeights[j] = SloppyMath.logAdd(leftWeights[j], weight + stateWeights[i]);
          rightWeights[k] = SloppyMath.logAdd(rightWeights[k], weight + stateWeights[i]);
        }
      }
    }
    position = recalculateTemporaryBetas(tree.children()[0], leftWeights, position,
                                         unaryTransitions, binaryTransitions, totalStateMass,
                                         tempUnaryBetas, tempBinaryBetas);
    position = recalculateTemporaryBetas(tree.children()[1], rightWeights, position,
                                         unaryTransitions, binaryTransitions, totalStateMass,
                                         tempUnaryBetas, tempBinaryBetas);
  }
  return position;
}
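The neginfDoubles helper is also outside this excerpt. From its use it evidently returns an array filled with Double.NEGATIVE_INFINITY, the log-space zero: SloppyMath.logAdd(a, b) computes log(e^a + e^b) stably, and logAdd(NEGATIVE_INFINITY, x) == x, so a -infinity array is the identity element for accumulating log-probabilities. A minimal sketch of such a helper:

// An array of log-space "zeros": combining any entry with a value x
// via SloppyMath.logAdd leaves x unchanged.
private static double[] neginfDoubles(int size) {
  double[] result = new double[size];
  java.util.Arrays.fill(result, Double.NEGATIVE_INFINITY);
  return result;
}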