Use of edu.stanford.nlp.stats.TwoDimensionalCounter in project CoreNLP by stanfordnlp.
The class ApplyPatternsMulti, method call().
@Override
public Pair<TwoDimensionalCounter<Pair<String, String>, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>> call() throws Exception {
  CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
  TwoDimensionalCounter<Pair<String, String>, E> allFreq = new TwoDimensionalCounter<>();
  for (String sentid : sentids) {
    List<CoreLabel> sent = sents.get(sentid).getTokens();
    // FIND_ALL is faster than FIND_NONOVERLAP
    Iterable<SequenceMatchResult<CoreMap>> matched = multiPatternMatcher.find(sent, SequenceMatcher.FindType.FIND_ALL);
    for (SequenceMatchResult<CoreMap> m : matched) {
      int s = m.start("$term");
      int e = m.end("$term");
      E matchedPat = patterns.get(m.pattern());
      matchedTokensByPat.add(matchedPat, new Triple<>(sentid, s, e));
      String phrase = "";
      String phraseLemma = "";
      boolean useWordNotLabeled = false;
      boolean doNotUse = false;
      // If the neighboring words are also labeled, club them together into one phrase
      // by widening [s, e) until an unlabeled word is hit on either side.
      if (constVars.clubNeighboringLabeledWords) {
        for (int i = s - 1; i >= 0; i--) {
          if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
            s = i + 1;
            break;
          }
        }
        for (int i = e; i < sent.size(); i++) {
          if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
            e = i;
            break;
          }
        }
      }
      // Record which indices contributed words, so phrases with stop words in the middle
      // can be discarded, while phrases that merely had stop words trimmed from the ends
      // (when removeStopWordsFromSelectedPhrases is true) are kept.
      boolean[] addedindices = new boolean[e - s];
      Arrays.fill(addedindices, false);
      for (int i = s; i < e; i++) {
        CoreLabel l = sent.get(i);
        l.set(PatternsAnnotations.MatchedPattern.class, true);
        if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class))
          l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
        l.get(PatternsAnnotations.MatchedPatterns.class).add(matchedPat);
        for (Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
          if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
            doNotUse = true;
          }
        }
        boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
        if (removePhrasesWithStopWords && containsStop) {
          doNotUse = true;
        } else {
          if (!containsStop || !removeStopWordsFromSelectedPhrases) {
            if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
              useWordNotLabeled = true;
            }
            phrase += " " + l.word();
            phraseLemma += " " + l.lemma();
            addedindices[i - s] = true;
          }
        }
      }
      // Discard the phrase if a word in the middle was skipped while both neighbors were kept.
      for (int i = 1; i < addedindices.length - 1; i++) {
        if (addedindices[i - 1] && !addedindices[i] && addedindices[i + 1]) {
          doNotUse = true;
          break;
        }
      }
      if (!doNotUse && useWordNotLabeled) {
        phrase = phrase.trim();
        phraseLemma = phraseLemma.trim();
        allFreq.incrementCount(new Pair<>(phrase, phraseLemma), matchedPat, 1.0);
      }
    }
  }
  return new Pair<>(allFreq, matchedTokensByPat);
}
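The TwoDimensionalCounter usage to note above is allFreq.incrementCount(new Pair<>(phrase, phraseLemma), matchedPat, 1.0): rows are (surface phrase, lemmatized phrase) pairs, columns are the patterns that extracted them. A minimal standalone sketch of that counting scheme, using hypothetical String pattern ids in place of the generic E:

import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.util.Pair;

public class PhrasePatternCounts {
  public static void main(String[] args) {
    // Rows are (surface phrase, lemma) pairs; columns are pattern ids.
    TwoDimensionalCounter<Pair<String, String>, String> allFreq = new TwoDimensionalCounter<>();
    allFreq.incrementCount(new Pair<>("heart attacks", "heart attack"), "PAT-1", 1.0);
    allFreq.incrementCount(new Pair<>("heart attacks", "heart attack"), "PAT-2", 1.0);
    allFreq.incrementCount(new Pair<>("strokes", "stroke"), "PAT-1", 1.0);
    // Count for a single (phrase, pattern) cell.
    System.out.println(allFreq.getCount(new Pair<>("heart attacks", "heart attack"), "PAT-1")); // 1.0
    // Inner counter over every pattern that extracted this phrase.
    Counter<String> byPattern = allFreq.getCounter(new Pair<>("heart attacks", "heart attack"));
    System.out.println(byPattern.totalCount()); // 2.0
    // Grand total across all phrases and patterns.
    System.out.println(allFreq.totalCount()); // 3.0
  }
}

The inner Counter returned by getCounter is how a result like allFreq is typically consumed downstream, pattern by pattern.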
Use of edu.stanford.nlp.stats.TwoDimensionalCounter in project CoreNLP by stanfordnlp.
The class Treebanks, method countTaggings().
private static void countTaggings(Treebank tb, final PrintWriter pw) {
  final TwoDimensionalCounter<String, String> wtc = new TwoDimensionalCounter<>();
  tb.apply(tree -> {
    List<TaggedWord> tags = tree.taggedYield();
    for (TaggedWord tag : tags) {
      wtc.incrementCount(tag.word(), tag.tag());
    }
  });
  // One line per word type: the word, then tab-separated (tag, count) pairs.
  for (String key : wtc.firstKeySet()) {
    pw.print(key);
    pw.print('\t');
    Counter<String> ctr = wtc.getCounter(key);
    for (String k2 : ctr.keySet()) {
      pw.print(k2 + '\t' + ctr.getCount(k2) + '\t');
    }
    pw.println();
  }
}
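The tags on each output line come out in arbitrary keySet() order. If sorted output is wanted, Counters.toSortedList orders the inner counter by descending count; a small sketch of that variant:

import java.util.List;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalCounter;

public class SortedTaggings {
  public static void main(String[] args) {
    TwoDimensionalCounter<String, String> wtc = new TwoDimensionalCounter<>();
    wtc.incrementCount("saw", "VBD");
    wtc.incrementCount("saw", "VBD");
    wtc.incrementCount("saw", "NN");
    Counter<String> ctr = wtc.getCounter("saw");
    // Tags in descending count order rather than arbitrary keySet() order.
    List<String> tags = Counters.toSortedList(ctr);
    for (String tag : tags) {
      System.out.println("saw\t" + tag + '\t' + ctr.getCount(tag)); // VBD 2.0, then NN 1.0
    }
  }
}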
Use of edu.stanford.nlp.stats.TwoDimensionalCounter in project CoreNLP by stanfordnlp.
The class SplitCanditoTrees, method preprocessMWEs().
static void preprocessMWEs(Map<String, Tree> treeMap) {
  TwoDimensionalCounter<String, String> labelTerm = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> termLabel = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> labelPreterm = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> pretermLabel = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<>();
  // First pass: gather MWE statistics over the whole tree map.
  for (Tree t : treeMap.values()) {
    MWEPreprocessor.countMWEStatistics(t, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
  }
  // Second pass: use the collected counts to fix MWE subtrees in place.
  for (Tree t : treeMap.values()) {
    MWEPreprocessor.traverseAndFix(t, pretermLabel, unigramTagger);
  }
}
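Both passes hinge on unigramTagger, a TwoDimensionalCounter from word to tag counts collected in pass one; pass two can then resolve a missing tag to the word's most frequent one. A sketch of that lookup, where the "UNKNOWN" fallback is a hypothetical placeholder rather than what MWEPreprocessor actually does:

import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalCounter;

public class UnigramTagLookup {
  // Picks the most frequent tag seen for a word, or a placeholder when unseen.
  static String mostFrequentTag(TwoDimensionalCounter<String, String> unigramTagger, String word) {
    if (!unigramTagger.firstKeySet().contains(word)) {
      return "UNKNOWN"; // hypothetical fallback; the real code applies other heuristics
    }
    return Counters.argmax(unigramTagger.getCounter(word));
  }

  public static void main(String[] args) {
    TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<>();
    unigramTagger.incrementCount("pomme", "NC");
    unigramTagger.incrementCount("pomme", "NC");
    unigramTagger.incrementCount("pomme", "ADJ");
    System.out.println(mostFrequentTag(unigramTagger, "pomme")); // NC
  }
}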
Use of edu.stanford.nlp.stats.TwoDimensionalCounter in project CoreNLP by stanfordnlp.
The class MultiWordPreprocessor, method main().
/**
 * Reads a treebank from the file given as the lone positional argument,
 * trains a unigram tagger over it, and uses the tagger to resolve DUMMY tags.
 *
 * @param args command-line flags and the input tree file (see usage())
 */
public static void main(String[] args) {
  Properties options = StringUtils.argsToProperties(args, argOptionDefs);
  if (!options.containsKey("") || options.containsKey("help")) {
    log.info(usage());
    return;
  }
  boolean retainNER = PropertiesUtils.getBool(options, "ner", false);
  boolean normalize = PropertiesUtils.getBool(options, "normalize", true);
  final File treeFile = new File(options.getProperty(""));
  TwoDimensionalCounter<String, String> labelTerm = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> termLabel = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> labelPreterm = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> pretermLabel = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<>();
  try {
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
    TreeReaderFactory trf = new SpanishTreeReaderFactory();
    TreeReader tr = trf.newTreeReader(br);
    for (Tree t; (t = tr.readTree()) != null; ) {
      updateTagger(unigramTagger, t);
    }
    // Closes the underlying reader as well.
    tr.close();
    System.out.println("Resolving DUMMY tags");
    resolveDummyTags(treeFile, unigramTagger, retainNER, normalize ? new SpanishTreeNormalizer(true, false, false) : null);
    System.out.println("#Unknown Word Types: " + ManualUWModel.nUnknownWordTypes);
    System.out.println(String.format("#Missing POS: %d (fixed: %d, %.2f%%)", nMissingPOS, nFixedPOS, (double) nFixedPOS / nMissingPOS * 100));
    System.out.println(String.format("#Missing Phrasal: %d (fixed: %d, %.2f%%)", nMissingPhrasal, nFixedPhrasal, (double) nFixedPhrasal / nMissingPhrasal * 100));
    System.out.println("Done!");
  } catch (IOException e) {
    // FileNotFoundException and UnsupportedEncodingException are both IOExceptions,
    // so a single catch covers all three cases handled separately before.
    e.printStackTrace();
  }
}
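updateTagger itself is not shown on this page. A plausible reconstruction, assuming it simply tallies (word, tag) pairs from each tree's tagged yield; the real method may additionally filter tags:

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.Tree;

class TaggerUpdate {
  // Hypothetical sketch: count how often each word appears under each POS tag.
  static void updateTagger(TwoDimensionalCounter<String, String> tagger, Tree t) {
    for (TaggedWord tw : t.taggedYield()) {
      tagger.incrementCount(tw.word(), tw.tag());
    }
  }
}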
Use of edu.stanford.nlp.stats.TwoDimensionalCounter in project CoreNLP by stanfordnlp.
The class FTBDataset, method preprocessMWEs().
/**
 * Corrects MWE annotations that lack internal POS labels.
 */
private void preprocessMWEs() {
  TwoDimensionalCounter<String, String> labelTerm = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> termLabel = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> labelPreterm = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> pretermLabel = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<>();
  // Same two-pass scheme as SplitCanditoTrees.preprocessMWEs above: count, then fix.
  for (Tree t : treebank) {
    MWEPreprocessor.countMWEStatistics(t, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
  }
  for (Tree t : treebank) {
    MWEPreprocessor.traverseAndFix(t, pretermLabel, unigramTagger);
  }
}