use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
the class CountClosedTags method countTrainingTags.
/**
 * Makes a single pass over the sentences of the given file.  The first
 * {@code (int) (sentences * trainingRatio)} sentences are counted into
 * both trainingWords and allWords; every sentence after that is counted
 * into allWords only.
 *
 * @param file the tagged file whose sentences are counted
 * @throws IOException declared for the underlying file access
 */
void countTrainingTags(TaggedFileRecord file) throws IOException {
  int totalSentences = countSentences(file);
  int trainingRemaining = (int) (totalSentences * trainingRatio);
  TaggedFileReader reader = file.reader();

  // Training portion: each sentence contributes to both counters.
  while (trainingRemaining > 0 && reader.hasNext()) {
    List<TaggedWord> sentence = reader.next();
    addTaggedWords(sentence, trainingWords);
    addTaggedWords(sentence, allWords);
    --trainingRemaining;
  }

  // Whatever is left only contributes to the overall counter.
  while (reader.hasNext()) {
    addTaggedWords(reader.next(), allWords);
  }
}
use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
the class MakePrefixFile method main.
/**
 * Reads the tagged sentences of the file named by the {@code input} property
 * and prints, for each non-empty sentence, a prefix of random length between
 * 1 and the sentence length (inclusive).  When the {@code fullSentence}
 * property is true (default false) the complete sentence is printed on the
 * following line as well.  Words and tags are joined with the
 * {@code tagSeparator} property, which defaults to
 * {@code TaggerConfig.TAG_SEPARATOR}.
 *
 * @param args command line arguments, converted to properties by
 *             {@code StringUtils.argsToProperties}
 */
public static void main(String[] args) {
  Properties config = StringUtils.argsToProperties(args);
  log.info(config);
  boolean fullSentence = PropertiesUtils.getBool(config, "fullSentence", false);
  Random random = new Random();
  String tagSeparator = config.getProperty("tagSeparator", TaggerConfig.TAG_SEPARATOR);
  TaggedFileRecord record = TaggedFileRecord.createRecord(config, config.getProperty("input"));
  for (List<TaggedWord> sentence : record.reader()) {
    if (sentence.isEmpty()) {
      // Random.nextInt(bound) requires bound > 0 and would throw
      // IllegalArgumentException here; an empty sentence has no prefix to print.
      continue;
    }
    // nextInt(size) is in [0, size), so the prefix length is in [1, size].
    int len = random.nextInt(sentence.size()) + 1;
    System.out.println(SentenceUtils.listToString(sentence.subList(0, len), false, tagSeparator));
    if (fullSentence) {
      System.out.println(SentenceUtils.listToString(sentence, false, tagSeparator));
    }
  }
}
use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
the class ReadDataTagged method loadFile.
/**
 * Reads every sentence from the given reader and records it in this object's
 * training data structures.
 *
 * For each sentence the words and tags (followed by Tagger.EOS_WORD /
 * Tagger.EOS_TAG) are processed position by position: a History is built, a
 * WordTag is appended to pairs, a DataWordTag is appended to v, and the count
 * of the tag for that word is incremented in wordTagCounts.  The method also
 * updates maxentTagger.tagTokens, numElements, totalSentences and totalWords,
 * and logs progress and summary statistics.
 *
 * @param reader source of tagged sentences; its filename() is used in log messages
 * @param wordTagCounts map from word to a counter of the tags seen with that
 *        word; entries are created on demand and incremented in place
 */
private void loadFile(TaggedFileReader reader, Map<String, IntCounter<String>> wordTagCounts) {
log.info("Loading tagged words from " + reader.filename());
// Scratch buffers holding the current sentence's words and tags; cleared after each sentence.
ArrayList<String> words = new ArrayList<>();
ArrayList<String> tags = new ArrayList<>();
// Per-call statistics, used only for the log messages below.
int numSentences = 0;
int numWords = 0;
// Sentinels: if the reader yields no sentences these values are logged unchanged.
int maxLen = Integer.MIN_VALUE;
int minLen = Integer.MAX_VALUE;
for (List<TaggedWord> sentence : reader) {
// When a word function is configured, replace each word by the function's output, keeping the tag.
if (maxentTagger.wordFunction != null) {
List<TaggedWord> newSentence = new ArrayList<>(sentence.size());
for (TaggedWord word : sentence) {
TaggedWord newWord = new TaggedWord(maxentTagger.wordFunction.apply(word.word()), word.tag());
newSentence.add(newWord);
}
sentence = newSentence;
}
// Copy words and tags into the scratch buffers and register each word under its tag in tagTokens.
// NOTE(review): a null entry is skipped here, which would leave words/tags shorter than
// sentence.size() + 1 while the indexing loop below still runs up to that bound — confirm
// that readers never produce null entries.
for (TaggedWord tw : sentence) {
if (tw != null) {
words.add(tw.word());
tags.add(tw.tag());
if (!maxentTagger.tagTokens.containsKey(tw.tag())) {
maxentTagger.tagTokens.put(tw.tag(), Generics.<String>newHashSet());
}
maxentTagger.tagTokens.get(tw.tag()).add(tw.word());
}
}
maxLen = (sentence.size() > maxLen ? sentence.size() : maxLen);
minLen = (sentence.size() < minLen ? sentence.size() : minLen);
// Terminate the sentence with the end-of-sentence marker so it is processed like a token.
words.add(Tagger.EOS_WORD);
tags.add(Tagger.EOS_TAG);
// The +1 accounts for the end-of-sentence entry appended above.
numElements = numElements + sentence.size() + 1;
// iterate over the words in the sentence, plus the trailing end-of-sentence entry
for (int i = 0; i < sentence.size() + 1; i++) {
// totalWords + totalSentences advances by sentence.size() + 1 per sentence, the same number of
// entries this loop appends to pairs.  The History is created before the current WordTag is
// added to pairs.
// NOTE(review): the three index arguments look like start, end and current position within
// pairs — confirm against the History constructor.
History h = new History(totalWords + totalSentences, totalWords + totalSentences + sentence.size(), totalWords + totalSentences + i, pairs, maxentTagger.extractors);
String tag = tags.get(i);
String word = words.get(i);
pairs.add(new WordTag(word, tag));
// addTag returns the int stored with the tag in the DataWordTag.
int y = maxentTagger.addTag(tag);
DataWordTag dat = new DataWordTag(h, y, tag);
v.add(dat);
// Create the per-word tag counter on first sight of the word, then count this tag.
IntCounter<String> tagCounts = wordTagCounts.get(word);
if (tagCounts == null) {
tagCounts = new IntCounter<>();
wordTagCounts.put(word, tagCounts);
}
tagCounts.incrementCount(tag, 1);
}
// Update the object-wide totals and the per-call statistics, then reset the scratch buffers.
totalSentences++;
totalWords += sentence.size();
numSentences++;
numWords += sentence.size();
words.clear();
tags.clear();
// Progress message every 100000 sentences.
if ((numSentences % 100000) == 0)
log.info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words ... [still reading]");
}
log.info("Read " + numWords + " words from " + reader.filename() + " [done].");
log.info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words.");
}
use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
the class CollocationFinder method getStemmedWordTagsFromTree.
/**
 * Builds the list of stemmed WordTags for a tree.
 *
 * @param t a tree
 * @return one WordTag per entry of the tree's tagged yield, in yield order,
 *         each produced by {@code Morphology.stemStatic} from the entry's
 *         word and POS tag
 */
private static List<WordTag> getStemmedWordTagsFromTree(Tree t) {
  List<WordTag> stemmed = Generics.newArrayList();
  for (TaggedWord leaf : t.taggedYield()) {
    stemmed.add(Morphology.stemStatic(leaf.word(), leaf.tag()));
  }
  return stemmed;
}
use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.
the class CollocationFinder method getNonStemmedWordTagsFromTree.
/**
 * Builds the list of unstemmed WordTags for a tree.
 *
 * @param t a tree
 * @return one WordTag per entry of the tree's tagged yield, in yield order,
 *         carrying that entry's word and tag unchanged
 */
private static List<WordTag> getNonStemmedWordTagsFromTree(Tree t) {
  List<WordTag> result = Generics.newArrayList();
  for (TaggedWord leaf : t.taggedYield()) {
    result.add(new WordTag(leaf.word(), leaf.tag()));
  }
  return result;
}
Aggregations