Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
The class FindTreebankTree, method main:
public static void main(String[] args) {
  // Args specified with -tagSeparator, -encoding, etc. are assigned
  // to the appropriate option. Otherwise, the first arg found is
  // the sentence to look for, and all other args are paths in which
  // to look for that sentence.
  String needle = "";
  String tagSeparator = "_";
  String encoding = "utf-8";
  String fileRegex = "";
  List<String> paths = new ArrayList<>();
  for (int i = 0; i < args.length; ++i) {
    if ((args[i].equalsIgnoreCase("-tagSeparator") ||
         args[i].equalsIgnoreCase("--tagSeparator")) && i + 1 < args.length) {
      tagSeparator = args[i + 1];
      ++i;
    } else if ((args[i].equalsIgnoreCase("-encoding") ||
                args[i].equalsIgnoreCase("--encoding")) && i + 1 < args.length) {
      encoding = args[i + 1];
      ++i;
    } else if ((args[i].equalsIgnoreCase("-fileRegex") ||
                args[i].equalsIgnoreCase("--fileRegex")) && i + 1 < args.length) {
      fileRegex = args[i + 1];
      ++i;
    } else if (needle.equals("")) {
      needle = args[i].trim();
    } else {
      paths.add(args[i]);
    }
  }

  TreeReaderFactory trf = new LabeledScoredTreeReaderFactory();

  // If the user specified a regex, make a file filter from it,
  // using a lambda for the filter.
  FileFilter filter = null;
  if (!fileRegex.equals("")) {
    final Pattern filePattern = Pattern.compile(fileRegex);
    filter = pathname -> (pathname.isDirectory() ||
                          filePattern.matcher(pathname.getName()).matches());
  }

  for (String path : paths) {
    // Start a new treebank with the given path, encoding, filter, etc.
    DiskTreebank treebank = new DiskTreebank(trf, encoding);
    treebank.loadPath(path, filter);

    Iterator<Tree> treeIterator = treebank.iterator();
    int treeCount = 0;
    String currentFile = "";
    while (treeIterator.hasNext()) {
      // Keep track of which file we are currently looking at
      if (!currentFile.equals(treebank.getCurrentFilename())) {
        currentFile = treebank.getCurrentFilename();
        treeCount = 0;
      }
      ++treeCount;

      Tree tree = treeIterator.next();
      List<TaggedWord> sentence = tree.taggedYield();

      // The tree can match in one of three ways: untagged, untagged
      // and unsegmented (which is useful for Chinese, for example),
      // or tagged.
      String haystack = SentenceUtils.listToString(sentence, true);
      boolean found = needle.equals(haystack);
      haystack = haystack.replaceAll(" ", "");
      found = found || needle.equals(haystack);
      haystack = SentenceUtils.listToString(sentence, false, tagSeparator);
      found = found || needle.equals(haystack);

      if (found) {
        System.out.println("needle found in " + currentFile + " tree " + treeCount);
      }
    }
  }
}
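To make the three match modes concrete, here is a minimal sketch (the class name and sentence are invented; the SentenceUtils.listToString calls are the same ones used above) that prints the three haystack forms a needle is compared against:

import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.ling.TaggedWord;
import java.util.Arrays;
import java.util.List;

public class HaystackDemo {
  public static void main(String[] args) {
    List<TaggedWord> sentence = Arrays.asList(
        new TaggedWord("我", "PN"), new TaggedWord("喜欢", "VV"), new TaggedWord("茶", "NN"));
    // untagged: "我 喜欢 茶"
    System.out.println(SentenceUtils.listToString(sentence, true));
    // untagged and unsegmented: "我喜欢茶"
    System.out.println(SentenceUtils.listToString(sentence, true).replaceAll(" ", ""));
    // tagged, with the default "_" separator: "我_PN 喜欢_VV 茶_NN"
    System.out.println(SentenceUtils.listToString(sentence, false, "_"));
  }
}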
Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
The class ChineseCharacterBasedLexicon, method finishTraining:
@Override
public void finishTraining() {
  Timing.tick("Counting characters...");
  ClassicCounter<Symbol> charCounter = new ClassicCounter<>();

  // first find all chars that occur only once
  for (List<TaggedWord> labels : trainingSentences) {
    for (TaggedWord label : labels) {
      String word = label.word();
      if (word.equals(BOUNDARY)) {
        continue;
      }
      for (int j = 0, length = word.length(); j < length; j++) {
        Symbol sym = Symbol.cannonicalSymbol(word.charAt(j));
        charCounter.incrementCount(sym);
      }
      charCounter.incrementCount(Symbol.END_WORD);
    }
  }

  Set<Symbol> singletons = Counters.keysBelow(charCounter, 1.5);
  knownChars = Generics.newHashSet(charCounter.keySet());

  Timing.tick("Counting nGrams...");
  GeneralizedCounter[] POSspecificCharNGrams = new GeneralizedCounter[CONTEXT_LENGTH + 1];
  for (int i = 0; i <= CONTEXT_LENGTH; i++) {
    POSspecificCharNGrams[i] = new GeneralizedCounter(i + 2);
  }

  ClassicCounter<String> POSCounter = new ClassicCounter<>();
  List<Serializable> context = new ArrayList<>(CONTEXT_LENGTH + 1);
  for (List<TaggedWord> words : trainingSentences) {
    for (TaggedWord taggedWord : words) {
      String word = taggedWord.word();
      String tag = taggedWord.tag();
      tagIndex.add(tag);
      if (word.equals(BOUNDARY)) {
        continue;
      }
      POSCounter.incrementCount(tag);
      for (int i = 0, size = word.length(); i <= size; i++) {
        Symbol sym;
        Symbol unknownCharClass = null;
        context.clear();
        context.add(tag);
        if (i < size) {
          char thisCh = word.charAt(i);
          sym = Symbol.cannonicalSymbol(thisCh);
          if (singletons.contains(sym)) {
            unknownCharClass = unknownCharClass(sym);
            charCounter.incrementCount(unknownCharClass);
          }
        } else {
          sym = Symbol.END_WORD;
        }
        // POS-specific 1-gram
        POSspecificCharNGrams[0].incrementCount(context, sym);
        if (unknownCharClass != null) {
          // for unknown ch model
          POSspecificCharNGrams[0].incrementCount(context, unknownCharClass);
        }
        // this could be made faster using .subList like in score
        for (int j = 1; j <= CONTEXT_LENGTH; j++) {
          // poly grams
          if (i - j < 0) {
            context.add(Symbol.BEGIN_WORD);
            POSspecificCharNGrams[j].incrementCount(context, sym);
            if (unknownCharClass != null) {
              // for unknown ch model
              POSspecificCharNGrams[j].incrementCount(context, unknownCharClass);
            }
            break;
          } else {
            Symbol prev = Symbol.cannonicalSymbol(word.charAt(i - j));
            if (singletons.contains(prev)) {
              context.add(unknownCharClass(prev));
            } else {
              context.add(prev);
            }
            POSspecificCharNGrams[j].incrementCount(context, sym);
            if (unknownCharClass != null) {
              // for unknown ch model
              POSspecificCharNGrams[j].incrementCount(context, unknownCharClass);
            }
          }
        }
      }
    }
  }

  POSDistribution = Distribution.getDistribution(POSCounter);
  Timing.tick("Creating character prior distribution...");

  charDistributions = Generics.newHashMap();
  // charCounter.incrementCount(Symbol.UNKNOWN, singletons.size());
  int numberOfKeys = charCounter.size() + singletons.size();
  Distribution<Symbol> prior = Distribution.goodTuringSmoothedCounter(charCounter, numberOfKeys);
  charDistributions.put(Collections.EMPTY_LIST, prior);

  for (int i = 0; i <= CONTEXT_LENGTH; i++) {
    Set<Map.Entry<List<Serializable>, ClassicCounter<Symbol>>> counterEntries =
        POSspecificCharNGrams[i].lowestLevelCounterEntrySet();
    Timing.tick("Creating " + counterEntries.size() + " character " +
                (i + 1) + "-gram distributions...");
    for (Map.Entry<List<Serializable>, ClassicCounter<Symbol>> entry : counterEntries) {
      context = entry.getKey();
      ClassicCounter<Symbol> c = entry.getValue();
      Distribution<Symbol> thisPrior =
          charDistributions.get(context.subList(0, context.size() - 1));
      double priorWeight = thisPrior.getNumberOfKeys() / 200.0;
      Distribution<Symbol> newDist =
          Distribution.dynamicCounterWithDirichletPrior(c, thisPrior, priorWeight);
      charDistributions.put(context, newDist);
    }
  }
}
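The 1.5 threshold above is simply a robust cutoff for selecting count-1 keys from an integral counter. A minimal, self-contained sketch of that singleton detection (the class name and data are invented; Counters.keysBelow is the same call used above):

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counters;
import java.util.Set;

public class SingletonDemo {
  public static void main(String[] args) {
    ClassicCounter<Character> charCounter = new ClassicCounter<>();
    for (char ch : "的的的吃".toCharArray()) {
      charCounter.incrementCount(ch);
    }
    // counts are integral, so <= 1.5 selects exactly the count-1 keys
    Set<Character> singletons = Counters.keysBelow(charCounter, 1.5);
    System.out.println(singletons);  // [吃]
  }
}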
Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
The class ChineseMaxentLexicon, method train:
/**
 * Add the given sentence to the statistics counted. Can
 * be called multiple times with different sentences.
 */
@Override
public void train(List<TaggedWord> sentence, double weight) {
  featExtractor.train(sentence, weight);
  for (TaggedWord word : sentence) {
    datumCounter.incrementCount(word, weight);
    tagsForWord.add(word.word(), word.tag());
  }
}
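Assuming this train(List&lt;TaggedWord&gt;, double) signature comes from CoreNLP's Lexicon interface (which the @Override suggests), a hedged sketch of how a caller might feed weighted sentences; the helper method, sentence, and weights here are made up:

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.lexparser.Lexicon;
import java.util.Arrays;
import java.util.List;

public class LexiconTrainingSketch {
  // Feed one tagged sentence to a Lexicon implementation such as an
  // already-constructed ChineseMaxentLexicon; the weight parameter
  // lets callers mix corpora unevenly (e.g. down-weight noisy data).
  static void addSentence(Lexicon lexicon, double weight) {
    List<TaggedWord> sentence = Arrays.asList(
        new TaggedWord("我", "PN"), new TaggedWord("喜欢", "VV"), new TaggedWord("茶", "NN"));
    lexicon.train(sentence, weight);
  }
}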
Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
The class ChineseWordFeatureExtractor, method train:
public void train(List<TaggedWord> sentence, double weight) {
  for (TaggedWord word : sentence) {
    String wordString = word.word();
    wordCounter.incrementCount(wordString, weight);
  }
}
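The same counting pattern, isolated as a runnable sketch with CoreNLP's ClassicCounter (the class name and sentence are invented). Note that incrementCount takes the weight directly, so down-weighted sentences accumulate fractional counts:

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.stats.ClassicCounter;
import java.util.Arrays;
import java.util.List;

public class WordCountSketch {
  public static void main(String[] args) {
    ClassicCounter<String> wordCounter = new ClassicCounter<>();
    List<TaggedWord> sentence = Arrays.asList(
        new TaggedWord("猫", "NN"), new TaggedWord("吃", "VV"), new TaggedWord("鱼", "NN"));
    double weight = 0.5;  // e.g. a down-weighted corpus
    for (TaggedWord word : sentence) {
      wordCounter.incrementCount(word.word(), weight);
    }
    System.out.println(wordCounter.getCount("猫"));  // 0.5
  }
}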
Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
The class BaseLexicon, method trainUnannotated:
@Override
public final void trainUnannotated(List<TaggedWord> sentence, double weight) {
  uwModelTrainer.incrementTreesRead(weight);
  int loc = 0;
  for (TaggedWord tw : sentence) {
    String baseTag = op.langpack().basicCategory(tw.tag());
    Counter<String> counts = baseTagCounts.get(baseTag);
    if (counts == null) {
      ++loc;
      continue;
    }
    double totalCount = counts.totalCount();
    if (totalCount == 0) {
      ++loc;
      continue;
    }
    for (String tag : counts.keySet()) {
      TaggedWord newTW = new TaggedWord(tw.word(), tag);
      train(newTW, loc, weight * counts.getCount(tag) / totalCount);
    }
    ++loc;
  }
}
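The reweighting arithmetic above distributes the sentence weight over every refined tag previously seen for the word's base tag, in proportion to each tag's share of the base tag's total count. A minimal sketch with made-up counts and hypothetical refined tag names:

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;

public class FractionalTagSketch {
  public static void main(String[] args) {
    Counter<String> counts = new ClassicCounter<>();
    counts.incrementCount("NN-1", 3.0);  // hypothetical refined tags
    counts.incrementCount("NN-2", 1.0);  // under one base tag "NN"
    double totalCount = counts.totalCount();  // 4.0
    double weight = 1.0;                      // sentence weight
    for (String tag : counts.keySet()) {
      double fraction = weight * counts.getCount(tag) / totalCount;
      // prints NN-1 -> 0.75 and NN-2 -> 0.25 (iteration order may vary)
      System.out.println(tag + " -> " + fraction);
    }
  }
}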