Use of edu.stanford.nlp.ling.Label in project CoreNLP by stanfordnlp.
From the class FactoredLexicon, method train.
/**
 * Trains the factored lexicon on a treebank.
 *
 * This method should populate wordIndex, tagIndex, and morphIndex, and it also
 * accumulates the seen and unseen word/tag, lemma/tag, and morph/tag counts
 * that are consulted at parse time.
 *
 * @param trees annotated, binarized training trees (preterminal labels are
 *        usually CategoryWordTag)
 * @param rawTrees parallel unbinarized trees whose leaves are CoreLabels
 *        carrying the morphological analysis in the originalText annotation;
 *        may be null, in which case the yields of {@code trees} are used
 */
@Override
public void train(Collection<Tree> trees, Collection<Tree> rawTrees) {
  // Uniform weight for every tree passed to the unknown-word model trainer.
  double weight = 1.0;
  // Train uw model on words
  uwModelTrainer.train(trees, weight);
  final double numTrees = trees.size();
  // rawTrees (if present) is iterated in lockstep with trees, so the two
  // collections are assumed to be parallel and the same size.
  Iterator<Tree> rawTreesItr = rawTrees == null ? null : rawTrees.iterator();
  Iterator<Tree> treeItr = trees.iterator();
  // Train factored lexicon on lemmas and morph tags
  int treeId = 0;
  while (treeItr.hasNext()) {
    Tree tree = treeItr.next();
    // CoreLabels, with morph analysis in the originalText annotation
    List<Label> yield = rawTrees == null ? tree.yield() : rawTreesItr.next().yield();
    // Annotated, binarized tree for the tags (labels are usually CategoryWordTag)
    List<Label> pretermYield = tree.preTerminalYield();
    int yieldLen = yield.size();
    for (int i = 0; i < yieldLen; ++i) {
      String word = yield.get(i).value();
      // Don't do anything with words
      int wordId = wordIndex.addToIndex(word);
      String tag = pretermYield.get(i).value();
      int tagId = tagIndex.addToIndex(tag);
      // Use the word as backup if there is no lemma
      String featureStr = ((CoreLabel) yield.get(i)).originalText();
      Pair<String, String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, featureStr);
      String lemma = lemmaMorph.first();
      // NOTE(review): lemma ids are interned in wordIndex, not a separate
      // lemma index (only morph tags get their own index) — confirm intended.
      int lemmaId = wordIndex.addToIndex(lemma);
      String richMorphTag = lemmaMorph.second();
      // Reduce the rich morphological string to the active feature subset;
      // fall back to a sentinel when no features survive the reduction.
      String reducedMorphTag = morphoSpec.strToFeatures(richMorphTag).toString().trim();
      reducedMorphTag = reducedMorphTag.isEmpty() ? NO_MORPH_ANALYSIS : reducedMorphTag;
      int morphId = morphIndex.addToIndex(reducedMorphTag);
      // Seen event counts
      wordTag.incrementCount(wordId, tagId);
      lemmaTag.incrementCount(lemmaId, tagId);
      morphTag.incrementCount(morphId, tagId);
      tagCounter.incrementCount(tagId);
      // Unseen event counts: only accumulated after a configurable fraction
      // of the corpus has been seen.  Because the increments above have
      // already run, totalCount() < 2 means this is the first occurrence of
      // the word/lemma/morph in training.
      if (treeId > op.trainOptions.fractionBeforeUnseenCounting * numTrees) {
        // NOTE(review): the firstKeySet().contains(...) disjuncts are always
        // true here (the key was just incremented), so the totalCount() < 2
        // test is the operative condition — confirm the negated containment
        // check is intentional rather than leftover.
        if (!wordTag.firstKeySet().contains(wordId) || wordTag.getCounter(wordId).totalCount() < 2) {
          wordTagUnseen.incrementCount(tagId);
        }
        if (!lemmaTag.firstKeySet().contains(lemmaId) || lemmaTag.getCounter(lemmaId).totalCount() < 2) {
          lemmaTagUnseen.incrementCount(tagId);
        }
        if (!morphTag.firstKeySet().contains(morphId) || morphTag.getCounter(morphId).totalCount() < 2) {
          morphTagUnseen.incrementCount(tagId);
        }
      }
    }
    ++treeId;
    // Lightweight progress ticks when debugging large treebanks.
    if (DEBUG && (treeId % 100) == 0) {
      System.err.printf("[%d]", treeId);
    }
    if (DEBUG && (treeId % 10000) == 0) {
      log.info();
    }
  }
}
Use of edu.stanford.nlp.ling.Label in project CoreNLP by stanfordnlp.
From the class ExhaustiveDependencyParser, method extractBestParse.
/**
 * Find the best (partial) parse within the parameter constraints by
 * backtracing through the inside-score chart: for each candidate split and
 * argument (word, tag), it checks whether the recomposed score reproduces
 * the stored best score, and recurses on the first combination that matches.
 *
 * @param start Sentence index of start of span (fenceposts, from 0 up)
 * @param end Sentence index of end of span (right side fencepost)
 * @param hWord Sentence index of head word (left side fencepost)
 * @param hTag Tag assigned to hWord
 * @return The best parse tree within the parameter constraints, or null if
 *         no split/argument combination reproduces the chart score (which
 *         indicates a bug and is logged)
 */
private Tree extractBestParse(int start, int end, int hWord, int hTag) {
  // NOTE(review): the @param text calls hWord a "left side fencepost", but it
  // is used below as the index of the head word — likely a copy-paste slip.
  if (DEBUG) {
    log.info("Span " + start + " to " + end + " word " + wordIndex.get(words[hWord]) + "/" + hWord + " tag " + tagIndex.get(hTag) + "/" + hTag + " score " + iScore(start, end, hWord, hTag));
  }
  String headWordStr = wordIndex.get(words[hWord]);
  String headTagStr = tagIndex.get(hTag);
  // The head word doubles as the "category" of the node in this dependency
  // representation (hence it is passed twice).
  Label headLabel = new CategoryWordTag(headWordStr, headWordStr, headTagStr);
  int numTags = tagIndex.size();
  // deal with span 1: base case — a preterminal node over the head word.
  if (end - start == 1) {
    Tree leaf = tf.newLeaf(new Word(headWordStr));
    return tf.newTreeNode(headLabel, Collections.singletonList(leaf));
  }
  // find backtrace
  List<Tree> children = new ArrayList<>();
  double bestScore = iScore(start, end, hWord, hTag);
  for (int split = start + 1; split < end; split++) {
    int binD = binDistance[hWord][split];
    if (hWord < split) {
      // Head is in the left sub-span; the argument (dependent) heads the
      // right sub-span [split, end).
      for (int aWord = split; aWord < end; aWord++) {
        for (int aTag = 0; aTag < numTags; aTag++) {
          // Recompose: left inside score + right inside score + attachment
          // score (binned by distance and tag bin) + the argument's stop
          // scores at both of its boundaries; accept if it matches bestScore.
          if (matches(iScore(start, split, hWord, hTag) + iScore(split, end, aWord, aTag) + headScore[binD][hWord][dg.tagBin(hTag)][aWord][dg.tagBin(aTag)] + headStop[aWord][dg.tagBin(aTag)][split] + headStop[aWord][dg.tagBin(aTag)][end], bestScore)) {
            if (DEBUG) {
              String argWordStr = wordIndex.get(words[aWord]);
              String argTagStr = tagIndex.get(aTag);
              log.info(headWordStr + "|" + headTagStr + " -> " + argWordStr + "|" + argTagStr + " " + bestScore);
            }
            // build it: greedy — return on the first matching backtrace.
            children.add(extractBestParse(start, split, hWord, hTag));
            children.add(extractBestParse(split, end, aWord, aTag));
            return tf.newTreeNode(headLabel, children);
          }
        }
      }
    } else {
      // Head is in the right sub-span; the argument heads the left
      // sub-span [start, split).
      for (int aWord = start; aWord < split; aWord++) {
        for (int aTag = 0; aTag < numTags; aTag++) {
          if (matches(iScore(start, split, aWord, aTag) + iScore(split, end, hWord, hTag) + headScore[binD][hWord][dg.tagBin(hTag)][aWord][dg.tagBin(aTag)] + headStop[aWord][dg.tagBin(aTag)][start] + headStop[aWord][dg.tagBin(aTag)][split], bestScore)) {
            if (DEBUG) {
              String argWordStr = wordIndex.get(words[aWord]);
              String argTagStr = tagIndex.get(aTag);
              log.info(headWordStr + "|" + headTagStr + " -> " + argWordStr + "|" + argTagStr + " " + bestScore);
            }
            children.add(extractBestParse(start, split, aWord, aTag));
            children.add(extractBestParse(split, end, hWord, hTag));
            // build it
            return tf.newTreeNode(headLabel, children);
          }
        }
      }
    }
  }
  // Should be unreachable: some split must reproduce the chart score.
  log.info("Problem in ExhaustiveDependencyParser::extractBestParse");
  return null;
}
Use of edu.stanford.nlp.ling.Label in project CoreNLP by stanfordnlp.
From the class BaseUnknownWordModel, method score.
// todo [cdm 2010]: Recheck that this method really does the right thing in making a P(W|T) estimate....
/**
 * Scores an unknown word given a tag: returns a log probability estimate of
 * P(word|tag).  Uses the word-signature model when any signature feature is
 * enabled, optionally backing off to Good-Turing for unseen signatures;
 * otherwise uses Good-Turing alone, or zero probability if no model exists.
 */
public float score(IntTaggedWord itw, String word) {
  String tagStr = itw.tagString(tagIndex);
  Label tagLabel = new Tag(tagStr);

  // No signature-based model configured at all.
  if (!(useEnd || useFirst || useFirstCap)) {
    if (useGT) {
      return scoreGT(tagStr);
    }
    // No model of any kind: warn and give zero probability.
    log.info("Warning: no unknown word model in place!\nGiving the combination " + word + ' ' + tagStr + " zero probability.");
    return Float.NEGATIVE_INFINITY;
  }

  // Signature-based path.  (getSignature here doesn't use sentence position.)
  String signature = getSignature(word, -1);
  boolean signatureSeen = seenEnd.contains(signature);

  // Back off to Good-Turing for signatures never seen in training.
  if (useGT && !signatureSeen) {
    return scoreGT(tagStr);
  }
  if (!signatureSeen) {
    signature = unknown;
  }

  // Terminal-rewrite distribution for the proposed tag.
  ClassicCounter<String> signatureLogProbs = tagHash.get(tagLabel);
  if (signatureLogProbs == null) {
    // Tag never seen in training: warn and give zero probability.
    log.info("Warning: proposed tag is unseen in training data:\t" + tagStr);
    return Float.NEGATIVE_INFINITY;
  }
  if (signatureLogProbs.keySet().contains(signature)) {
    return (float) signatureLogProbs.getCount(signature);
  }
  // Signature unseen with this tag: fall back to the UNKNOWN pseudo-entry.
  return (float) signatureLogProbs.getCount(unknown);
}
Use of edu.stanford.nlp.ling.Label in project CoreNLP by stanfordnlp.
From the class BaseUnknownWordModelTrainer, method finishTraining.
@Override
public UnknownWordModel finishTraining() {
  if (useGT) {
    unknownGTTrainer.finishTraining();
  }
  // Outer iteration: one terminal-rewrite distribution per observed tag.
  for (Map.Entry<Label, ClassicCounter<String>> tagEntry : c.entrySet()) {
    Label tagLabel = tagEntry.getKey();
    // Raw counts of word signatures observed with this tag.
    ClassicCounter<String> signatureCounts = tagEntry.getValue();
    ClassicCounter<String> logProbs = tagHash.computeIfAbsent(tagLabel, k -> new ClassicCounter<>());
    // The UNKNOWN sequence is assumed to be seen once with each tag — a
    // crude add-one smoothing that can be regarded as a Dirichlet prior.
    tc.incrementCount(tagLabel);
    signatureCounts.setCount(unknown, 1.0);
    // Normalizer is fixed for this tag, so hoist it out of the inner loop.
    double tagTotal = tc.getCount(tagLabel);
    // Inner iteration: convert each count to a log probability p(sig|tag).
    for (Map.Entry<String, Double> sigEntry : signatureCounts.entrySet()) {
      logProbs.setCount(sigEntry.getKey(), Math.log(sigEntry.getValue() / tagTotal));
    }
  }
  return model;
}
Use of edu.stanford.nlp.ling.Label in project CoreNLP by stanfordnlp.
From the class ChineseUnknownWordModelTrainer, method finishTraining.
/**
 * Converts the accumulated first-character counts into per-tag log
 * probability distributions and returns the finished model.
 *
 * Improvements over the previous version: iterates {@code c.entrySet()}
 * instead of {@code keySet()} + {@code get()} (consistent with the base
 * trainer and avoiding a lookup per tag), and hoists the per-tag
 * invariants ({@code tagHash.get}, {@code tc.getCount}) out of the inner
 * loop instead of re-fetching them for every word.
 */
@Override
public UnknownWordModel finishTraining() {
  if (useGT) {
    unknownGTTrainer.finishTraining();
  }
  // Outer iteration is over tags as Labels.
  for (Map.Entry<Label, ClassicCounter<String>> entry : c.entrySet()) {
    Label tagLab = entry.getKey();
    // Counts of first characters observed with this tag.
    ClassicCounter<String> wc = entry.getValue();
    if (!tagHash.containsKey(tagLab)) {
      tagHash.put(tagLab, new ClassicCounter<>());
    }
    // The UNKNOWN first character is assumed to be seen once with each tag —
    // crude add-one smoothing (can be regarded as a Dirichlet prior).
    tc.incrementCount(tagLab);
    wc.setCount(unknown, 1.0);
    // Both the target distribution and the normalizer are fixed for this
    // tag, so compute them once before the inner loop.
    ClassicCounter<String> logProbs = tagHash.get(tagLab);
    double tagTotal = tc.getCount(tagLab);
    // Inner iteration is over first characters: store log p(first|tag).
    for (Map.Entry<String, Double> wEntry : wc.entrySet()) {
      logProbs.setCount(wEntry.getKey(), Math.log(wEntry.getValue() / tagTotal));
    }
  }
  return model;
}
Aggregations