Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
The class TextTaggedFileReader, method primeNext.
void primeNext() {
  String line;
  try {
    line = reader.readLine();
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  if (line == null) {
    // end of input: no more sentences
    next = null;
    return;
  }
  ++numSentences;
  next = new ArrayList<>();
  StringTokenizer st = new StringTokenizer(line);
  while (st.hasMoreTokens()) {
    String token = st.nextToken();
    // lastIndexOf tolerates separator characters inside the word itself
    int indexUnd = token.lastIndexOf(tagSeparator);
    if (indexUnd < 0) {
      // numSentences was already incremented for this line, so report it directly
      throw new IllegalArgumentException("Data format error: can't find delimiter \"" + tagSeparator + "\" in word \"" + token + "\" (line " + numSentences + " of " + filename + ')');
    }
    String word = token.substring(0, indexUnd).intern();
    String tag = token.substring(indexUnd + 1).intern();
    next.add(new TaggedWord(word, tag));
  }
}
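For context, this reader expects one sentence per line, each token written as word + separator + tag (e.g. "The_DT"). Below is a minimal standalone sketch of the same parsing step, assuming the "_" separator; the sample line and the class name TaggedLineDemo are invented for illustration:

import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import edu.stanford.nlp.ling.TaggedWord;

public class TaggedLineDemo {
  public static void main(String[] args) {
    String tagSeparator = "_";               // assumed separator
    String line = "The_DT dog_NN barks_VBZ"; // one sentence per line
    List<TaggedWord> sentence = new ArrayList<>();
    StringTokenizer st = new StringTokenizer(line);
    while (st.hasMoreTokens()) {
      String token = st.nextToken();
      // split on the LAST separator, as primeNext does
      int indexUnd = token.lastIndexOf(tagSeparator);
      sentence.add(new TaggedWord(token.substring(0, indexUnd),
                                  token.substring(indexUnd + 1)));
    }
    System.out.println(sentence); // e.g. [The/DT, dog/NN, barks/VBZ]
  }
}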
Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
The class ChineseCharacterBasedLexicon, method finishTraining.
@Override
public void finishTraining() {
  Timing.tick("Counting characters...");
  ClassicCounter<Symbol> charCounter = new ClassicCounter<>();
  // first find all chars that occur only once
  for (List<TaggedWord> labels : trainingSentences) {
    for (TaggedWord label : labels) {
      String word = label.word();
      if (word.equals(BOUNDARY)) {
        continue;
      }
      for (int j = 0, length = word.length(); j < length; j++) {
        Symbol sym = Symbol.cannonicalSymbol(word.charAt(j));
        charCounter.incrementCount(sym);
      }
      charCounter.incrementCount(Symbol.END_WORD);
    }
  }
  // characters with count 1 (i.e., below 1.5) are treated as unknowns
  Set<Symbol> singletons = Counters.keysBelow(charCounter, 1.5);
  knownChars = Generics.newHashSet(charCounter.keySet());
  Timing.tick("Counting nGrams...");
  GeneralizedCounter[] POSspecificCharNGrams = new GeneralizedCounter[CONTEXT_LENGTH + 1];
  for (int i = 0; i <= CONTEXT_LENGTH; i++) {
    POSspecificCharNGrams[i] = new GeneralizedCounter(i + 2);
  }
  ClassicCounter<String> POSCounter = new ClassicCounter<>();
  List<Serializable> context = new ArrayList<>(CONTEXT_LENGTH + 1);
  for (List<TaggedWord> words : trainingSentences) {
    for (TaggedWord taggedWord : words) {
      String word = taggedWord.word();
      String tag = taggedWord.tag();
      tagIndex.add(tag);
      if (word.equals(BOUNDARY)) {
        continue;
      }
      POSCounter.incrementCount(tag);
      // iterate one past the last character so END_WORD is also counted
      for (int i = 0, size = word.length(); i <= size; i++) {
        Symbol sym;
        Symbol unknownCharClass = null;
        context.clear();
        context.add(tag);
        if (i < size) {
          char thisCh = word.charAt(i);
          sym = Symbol.cannonicalSymbol(thisCh);
          if (singletons.contains(sym)) {
            unknownCharClass = unknownCharClass(sym);
            charCounter.incrementCount(unknownCharClass);
          }
        } else {
          sym = Symbol.END_WORD;
        }
        // POS-specific 1-gram
        POSspecificCharNGrams[0].incrementCount(context, sym);
        if (unknownCharClass != null) {
          // for unknown ch model
          POSspecificCharNGrams[0].incrementCount(context, unknownCharClass);
        }
        // this could be made faster using .subList like in score
        for (int j = 1; j <= CONTEXT_LENGTH; j++) {
          // poly grams
          if (i - j < 0) {
            context.add(Symbol.BEGIN_WORD);
            POSspecificCharNGrams[j].incrementCount(context, sym);
            if (unknownCharClass != null) {
              // for unknown ch model
              POSspecificCharNGrams[j].incrementCount(context, unknownCharClass);
            }
            break;
          } else {
            Symbol prev = Symbol.cannonicalSymbol(word.charAt(i - j));
            if (singletons.contains(prev)) {
              context.add(unknownCharClass(prev));
            } else {
              context.add(prev);
            }
            POSspecificCharNGrams[j].incrementCount(context, sym);
            if (unknownCharClass != null) {
              // for unknown ch model
              POSspecificCharNGrams[j].incrementCount(context, unknownCharClass);
            }
          }
        }
      }
    }
  }
  POSDistribution = Distribution.getDistribution(POSCounter);
  Timing.tick("Creating character prior distribution...");
  charDistributions = Generics.newHashMap();
  // charCounter.incrementCount(Symbol.UNKNOWN, singletons.size());
  int numberOfKeys = charCounter.size() + singletons.size();
  Distribution<Symbol> prior = Distribution.goodTuringSmoothedCounter(charCounter, numberOfKeys);
  charDistributions.put(Collections.EMPTY_LIST, prior);
  for (int i = 0; i <= CONTEXT_LENGTH; i++) {
    Set<Map.Entry<List<Serializable>, ClassicCounter<Symbol>>> counterEntries = POSspecificCharNGrams[i].lowestLevelCounterEntrySet();
    Timing.tick("Creating " + counterEntries.size() + " character " + (i + 1) + "-gram distributions...");
    for (Map.Entry<List<Serializable>, ClassicCounter<Symbol>> entry : counterEntries) {
      context = entry.getKey();
      ClassicCounter<Symbol> c = entry.getValue();
      // back off to the distribution for the next-shorter context
      Distribution<Symbol> thisPrior = charDistributions.get(context.subList(0, context.size() - 1));
      double priorWeight = thisPrior.getNumberOfKeys() / 200.0;
      Distribution<Symbol> newDist = Distribution.dynamicCounterWithDirichletPrior(c, thisPrior, priorWeight);
      charDistributions.put(context, newDist);
    }
  }
}
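The singleton-detection idiom above is worth noting: Counters.keysBelow with a threshold of 1.5 picks out exactly the count-1 items. A minimal sketch with invented character data; the class name CharCountDemo is hypothetical, and Laplace smoothing stands in here for the Good-Turing call that finishTraining actually uses:

import java.util.Set;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.Distribution;

public class CharCountDemo {
  public static void main(String[] args) {
    ClassicCounter<Character> charCounter = new ClassicCounter<>();
    for (char c : "abca".toCharArray()) {
      charCounter.incrementCount(c);
    }
    // keysBelow(counter, 1.5) returns exactly the characters seen once
    Set<Character> singletons = Counters.keysBelow(charCounter, 1.5);
    System.out.println(singletons); // [b, c]
    // reserve extra key mass for unseen events, as finishTraining does
    int numberOfKeys = charCounter.size() + singletons.size();
    Distribution<Character> prior =
        Distribution.laplaceSmoothedDistribution(charCounter, numberOfKeys);
    System.out.println(prior.probabilityOf('a'));
  }
}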
Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
The class ChineseMaxentLexicon, method train.
/**
 * Add the given sentence to the statistics counted. Can
 * be called multiple times with different sentences.
 */
@Override
public void train(List<TaggedWord> sentence, double weight) {
  featExtractor.train(sentence, weight);
  for (TaggedWord word : sentence) {
    datumCounter.incrementCount(word, weight);
    tagsForWord.add(word.word(), word.tag());
  }
}
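A sketch of how this method's bookkeeping might be exercised in isolation, using CoreNLP's ClassicCounter and CollectionValuedMap; the sentence, the weight, and the class name LexiconTrainDemo are invented for illustration:

import java.util.Arrays;
import java.util.List;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.util.CollectionValuedMap;

public class LexiconTrainDemo {
  public static void main(String[] args) {
    // counts each (word, tag) datum, possibly with fractional weight
    ClassicCounter<TaggedWord> datumCounter = new ClassicCounter<>();
    // remembers every tag ever observed with a word
    CollectionValuedMap<String, String> tagsForWord = new CollectionValuedMap<>();
    List<TaggedWord> sentence = Arrays.asList(
        new TaggedWord("中国", "NR"), new TaggedWord("银行", "NN"));
    double weight = 1.0; // sentences can be weighted, e.g. for corpus mixing
    for (TaggedWord word : sentence) {
      datumCounter.incrementCount(word, weight);
      tagsForWord.add(word.word(), word.tag());
    }
    System.out.println(tagsForWord.get("银行")); // [NN]
  }
}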
Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
The class ChineseWordFeatureExtractor, method train.
public void train(List<TaggedWord> sentence, double weight) {
  for (TaggedWord word : sentence) {
    String wordString = word.word();
    wordCounter.incrementCount(wordString, weight);
  }
}
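Because weight is a double, repeated calls can contribute fractional counts to the same word. A tiny sketch with invented data and the hypothetical class name WordCountDemo:

import edu.stanford.nlp.stats.ClassicCounter;

public class WordCountDemo {
  public static void main(String[] args) {
    ClassicCounter<String> wordCounter = new ClassicCounter<>();
    wordCounter.incrementCount("银行", 1.0);
    wordCounter.incrementCount("银行", 0.5); // a half-weighted sentence
    System.out.println(wordCounter.getCount("银行")); // 1.5
  }
}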
Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
The class BaseLexicon, method trainUnannotated.
@Override
public final void trainUnannotated(List<TaggedWord> sentence, double weight) {
  uwModelTrainer.incrementTreesRead(weight);
  int loc = 0;
  for (TaggedWord tw : sentence) {
    String baseTag = op.langpack().basicCategory(tw.tag());
    Counter<String> counts = baseTagCounts.get(baseTag);
    if (counts == null) {
      ++loc;
      continue;
    }
    double totalCount = counts.totalCount();
    if (totalCount == 0) {
      ++loc;
      continue;
    }
    // distribute this word's weight over all tags seen with its base tag,
    // in proportion to their relative frequency
    for (String tag : counts.keySet()) {
      TaggedWord newTW = new TaggedWord(tw.word(), tag);
      train(newTW, loc, weight * counts.getCount(tag) / totalCount);
    }
    ++loc;
  }
}
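The inner loop smears one unannotated token's weight across every tag observed with its base tag, weighted by relative frequency. A standalone sketch of just that arithmetic, with invented counts and the hypothetical class name SmearDemo:

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;

public class SmearDemo {
  public static void main(String[] args) {
    // assumed tag counts for one base tag, e.g. 3 VB and 1 VBP
    Counter<String> counts = new ClassicCounter<>();
    counts.incrementCount("VB", 3.0);
    counts.incrementCount("VBP", 1.0);
    double totalCount = counts.totalCount();
    double weight = 1.0;
    for (String tag : counts.keySet()) {
      TaggedWord newTW = new TaggedWord("run", tag);
      double fractional = weight * counts.getCount(tag) / totalCount;
      // trainUnannotated would call train(newTW, loc, fractional) here
      System.out.println(newTW + " gets weight " + fractional); // run/VB gets 0.75
    }
  }
}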