Usage example of edu.stanford.nlp.stats.TwoDimensionalCounter in the stanfordnlp/CoreNLP project: the call() method of the ApplyDepPatterns class.
/**
 * Applies each dependency (Semgrex) pattern to the assigned sentences and collects,
 * per pattern, the frequency of every extracted candidate phrase plus the token
 * spans it matched.
 *
 * @return a pair of (phrase-by-pattern frequency counter,
 *         pattern -> (sentence id, start index, inclusive end index) spans)
 */
@Override
public Pair<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>> call() throws Exception {
  // Counts, per candidate phrase, how often each pattern extracted it.
  TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
  // For each pattern, the (sentence id, start token, inclusive end token) spans it matched.
  CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
  for (String sentid : sentids) {
    DataInstance sent = sents.get(sentid);
    List<CoreLabel> tokens = sent.getTokens();
    for (Map.Entry<SemgrexPattern, E> pEn : patterns.entrySet()) {
      if (pEn.getKey() == null)
        throw new RuntimeException("why is the pattern " + pEn + " null?");
      SemanticGraph graph = ((DataInstanceDep) sent).getGraph();
      Collection<ExtractedPhrase> matched = getMatchedTokensIndex(graph, pEn.getKey(), sent, label);
      for (ExtractedPhrase match : matched) {
        int s = match.startIndex;
        // e is exclusive: one past the last matched token.
        int e = match.endIndex + 1;
        String phrase = "";
        String phraseLemma = "";
        boolean useWordNotLabeled = false;
        boolean doNotUse = false;
        // If neighboring words already carry this label, club them into the matched
        // span, bounded by numWordsCompoundMapped for this label.
        if (constVars.clubNeighboringLabeledWords) {
          for (int i = s - 1; i >= 0; i--) {
            if (tokens.get(i).get(constVars.getAnswerClass().get(label)).equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped.get(label)) {
              s = i;
            } else
              break;
          }
          for (int i = e; i < tokens.size(); i++) {
            if (tokens.get(i).get(constVars.getAnswerClass().get(label)).equals(label) && (i - s + 1) <= PatternFactory.numWordsCompoundMapped.get(label)) {
              e = i;
            } else
              break;
          }
        }
        // Track which indices contributed a word so we can discard phrases with a
        // stopword-induced gap in the middle, while still allowing stopwords removed
        // at the ends when removeStopWordsFromSelectedPhrases is true.
        boolean[] addedindices = new boolean[e - s];
        Arrays.fill(addedindices, false);
        for (int i = s; i < e; i++) {
          CoreLabel l = tokens.get(i);
          // Mark the token as matched and record which pattern(s) matched it.
          l.set(PatternsAnnotations.MatchedPattern.class, true);
          if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null)
            l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
          Pattern pSur = pEn.getValue();
          assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
          assert l.get(PatternsAnnotations.MatchedPatterns.class) != null : "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.keySet();
          l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);
          // Reject the phrase if any token carries an "ignore" class/value pair.
          for (Map.Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
            if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
              doNotUse = true;
            }
          }
          boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
          if (removePhrasesWithStopWords && containsStop) {
            doNotUse = true;
          } else {
            if (!containsStop || !removeStopWordsFromSelectedPhrases) {
              // Remember whether at least one token in the phrase is not yet labeled.
              if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
                useWordNotLabeled = true;
              }
              phrase += " " + l.word();
              phraseLemma += " " + l.lemma();
              addedindices[i - s] = true;
            }
          }
        }
        // Discard the phrase if a token was skipped strictly between two kept tokens.
        for (int i = 0; i < addedindices.length; i++) {
          if (i > 0 && i < addedindices.length - 1 && addedindices[i - 1] == true && addedindices[i] == false && addedindices[i + 1] == true) {
            doNotUse = true;
            break;
          }
        }
        if (!doNotUse && useWordNotLabeled) {
          matchedTokensByPat.add(pEn.getValue(), new Triple<>(sentid, s, e - 1));
          // NOTE(review): the original re-checked useWordNotLabeled here, but the
          // enclosing condition already guarantees it; the redundant check is removed.
          phrase = phrase.trim();
          phraseLemma = phraseLemma.trim();
          allFreq.incrementCount(CandidatePhrase.createOrGet(phrase, phraseLemma, match.getFeatures()), pEn.getValue(), 1.0);
        }
      }
    }
  }
  return new Pair<>(allFreq, matchedTokensByPat);
}
Usage example of edu.stanford.nlp.stats.TwoDimensionalCounter in the stanfordnlp/CoreNLP project: the main() method of the MWEFrequencyDist class.
/**
 * Computes a frequency distribution of multi-word expressions (subtrees whose
 * label starts with "MW") in a French treebank file and prints per-label type,
 * singleton, and percentage statistics.
 *
 * Usage: java MWEFrequencyDist treeFile
 */
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.printf("Usage: java %s file%n", MWEFrequencyDist.class.getName());
    System.exit(-1);
  }
  final File treeFile = new File(args[0]);
  // MWE label -> terminal yield string -> count
  TwoDimensionalCounter<String, String> mweLabelToString = new TwoDimensionalCounter<>();
  Set<String> uniquePOSSequences = Generics.newHashSet();
  // try-with-resources: the original leaked the reader when an exception was
  // thrown before the explicit tr.close() was reached.
  try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
       TreeReader tr = new FrenchTreeReaderFactory().newTreeReader(br)) {
    final TregexPattern pMWE = TregexPattern.compile("/^MW/");
    for (Tree t; (t = tr.readTree()) != null; ) {
      // Count MWE statistics
      TregexMatcher m = pMWE.matcher(t);
      while (m.findNextMatchingNode()) {
        Tree match = m.getMatch();
        String label = match.value();
        List<CoreLabel> yield = match.taggedLabeledYield();
        StringBuilder termYield = new StringBuilder();
        StringBuilder posYield = new StringBuilder();
        for (CoreLabel cl : yield) {
          termYield.append(cl.word()).append(" ");
          posYield.append(cl.tag()).append(" ");
        }
        mweLabelToString.incrementCount(label, termYield.toString().trim());
        uniquePOSSequences.add(posYield.toString().trim());
      }
    }
    System.out.printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
    double nMWEs = mweLabelToString.totalCount();
    int nAllSingletons = 0;
    int nTokens = 0;
    for (String mweLabel : mweLabelToString.firstKeySet()) {
      int nSingletons = 0;
      double totalCount = mweLabelToString.totalCount(mweLabel);
      Counter<String> mc = mweLabelToString.getCounter(mweLabel);
      for (String term : mc.keySet()) {
        if (mc.getCount(term) == 1.0)
          nSingletons++;
        // Token count: words per term times the term's frequency.
        nTokens += term.split("\\s+").length * (int) mc.getCount(term);
      }
      nAllSingletons += nSingletons;
      System.out.printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int) totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
    }
    System.out.printf("TOTAL:\t%d\t%d\t%.2f%n", (int) nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
    System.out.println("#tokens = " + nTokens);
    System.out.println("#unique MWE POS sequences = " + uniquePOSSequences.size());
  } catch (TregexParseException | IOException e) {
    // IOException covers the original UnsupportedEncodingException and
    // FileNotFoundException catches (both are IOException subclasses).
    e.printStackTrace();
  }
}
Usage example of edu.stanford.nlp.stats.TwoDimensionalCounter in the stanfordnlp/CoreNLP project: the loadFromSavedPatternsWordsDir() method of the GetPatternsFromDataMultiClass class.
/**
 * Loads previously learned patterns and words for each label from the saved
 * patterns/words directory, optionally re-labels the sentences with the learned
 * words, and re-applies the learned patterns to the sentences.
 *
 * @param model the model to populate with the loaded patterns and words
 * @param props configuration: patternsWordsDir, loadModelForLabels,
 *              numIterationsOfSavedPatternsToLoad, sentsOutFile, etc.
 * @return a map from each loaded pattern to the label it was learned for
 * @throws IOException            if a saved file cannot be read or written
 * @throws ClassNotFoundException if a serialized pattern class cannot be resolved
 */
public static <E extends Pattern> Map<E, String> loadFromSavedPatternsWordsDir(GetPatternsFromDataMultiClass<E> model, Properties props) throws IOException, ClassNotFoundException {
  boolean labelSentsUsingModel = Boolean.parseBoolean(props.getProperty("labelSentsUsingModel", "true"));
  boolean applyPatsUsingModel = Boolean.parseBoolean(props.getProperty("applyPatsUsingModel", "true"));
  // Only patterns/words learned in iterations < this bound are kept.
  int numIterationsOfSavedPatternsToLoad = Integer.parseInt(props.getProperty(Flags.numIterationsOfSavedPatternsToLoad, String.valueOf(Integer.MAX_VALUE)));
  Map<E, String> labelsForPatterns = new HashMap<>();
  String patternsWordsDir = props.getProperty(Flags.patternsWordsDir);
  String sentsOutFile = props.getProperty("sentsOutFile");
  String loadModelForLabels = props.getProperty(Flags.loadModelForLabels);
  List<String> loadModelForLabelsList = null;
  if (loadModelForLabels != null)
    loadModelForLabelsList = Arrays.asList(loadModelForLabels.split("[,;]"));
  for (String label : model.constVars.getLabels()) {
    // When a subset of labels was requested, skip the rest.
    if (loadModelForLabels != null && !loadModelForLabelsList.contains(label))
      continue;
    assert (new File(patternsWordsDir + "/" + label).exists()) : "Why does the directory " + patternsWordsDir + "/" + label + " not exist?";
    readClassesInEnv(patternsWordsDir + "/env.txt", model.constVars.env, ConstantsAndVariables.globalEnv);
    // Read the token mapping
    if (model.constVars.patternType.equals(PatternFactory.PatternType.SURFACE))
      Token.setClass2KeyMapping(new File(patternsWordsDir + "/tokenenv.txt"));
    // Load the patterns learned in each iteration.
    File patf = new File(patternsWordsDir + "/" + label + "/patternsEachIter.ser");
    if (patf.exists()) {
      Map<Integer, Counter<E>> patterns = IOUtils.readObjectFromFile(patf);
      if (numIterationsOfSavedPatternsToLoad < Integer.MAX_VALUE) {
        Set<Integer> toremove = new HashSet<>();
        for (Integer i : patterns.keySet()) {
          if (i >= numIterationsOfSavedPatternsToLoad) {
            System.out.println("Removing patterns from iteration " + i);
            toremove.add(i);
          }
        }
        for (Integer i : toremove) patterns.remove(i);
      }
      Counter<E> pats = Counters.flatten(patterns);
      for (E p : pats.keySet()) {
        labelsForPatterns.put(p, label);
      }
      numIterationsLoadedModel = Math.max(numIterationsLoadedModel, patterns.size());
      model.setLearnedPatterns(pats, label);
      model.setLearnedPatternsEachIter(patterns, label);
      Redwood.log(Redwood.DBG, "Loaded " + model.getLearnedPatterns().get(label).size() + " patterns from " + patf);
    }
    // Load the words/phrases learned in each iteration.
    File wordf = new File(patternsWordsDir + "/" + label + "/phrases.txt");
    if (wordf.exists()) {
      TreeMap<Integer, Counter<CandidatePhrase>> words = GetPatternsFromDataMultiClass.readLearnedWordsFromFile(wordf);
      model.constVars.setLearnedWordsEachIter(words, label);
      if (numIterationsOfSavedPatternsToLoad < Integer.MAX_VALUE) {
        Set<Integer> toremove = new HashSet<>();
        for (Integer i : words.keySet()) {
          if (i >= numIterationsOfSavedPatternsToLoad) {
            // (the original message said "patterns" here — copy-paste slip)
            System.out.println("Removing learned words from iteration " + i);
            toremove.add(i);
          }
        }
        for (Integer i : toremove) words.remove(i);
      }
      numIterationsLoadedModel = Math.max(numIterationsLoadedModel, words.size());
      Redwood.log(Redwood.DBG, "Loaded " + words.size() + " phrases from " + wordf);
    }
    CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
    Iterator<Pair<Map<String, DataInstance>, File>> sentsIter = new ConstantsAndVariables.DataSentsIterator(model.constVars.batchProcessSents);
    TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted = new TwoDimensionalCounter<>();
    Set<CandidatePhrase> alreadyLabeledWords = new HashSet<>();
    while (sentsIter.hasNext()) {
      Pair<Map<String, DataInstance>, File> sents = sentsIter.next();
      if (labelSentsUsingModel) {
        Redwood.log(Redwood.DBG, "labeling sentences from " + sents.second() + " with the already learned words");
        assert sents.first() != null : "Why are sents null";
        model.labelWords(label, sents.first(), model.constVars.getLearnedWords(label).keySet(), sentsOutFile, matchedTokensByPat);
        if (sents.second().exists())
          IOUtils.writeObjectToFile(sents, sents.second());
      }
      if (model.constVars.restrictToMatched || applyPatsUsingModel) {
        Redwood.log(Redwood.DBG, "Applying patterns to " + sents.first().size() + " sentences");
        // The original added this batch to the inverted index twice; once suffices.
        model.constVars.invertedIndex.add(sents.first(), true);
        model.scorePhrases.applyPats(model.getLearnedPatterns(label), label, wordsandLemmaPatExtracted, matchedTokensByPat, alreadyLabeledWords);
      }
    }
    Counters.addInPlace(model.wordsPatExtracted.get(label), wordsandLemmaPatExtracted);
    System.out.println("All Extracted phrases are " + wordsandLemmaPatExtracted.firstKeySet());
  }
  System.out.flush();
  System.err.flush();
  return labelsForPatterns;
}
Usage example of edu.stanford.nlp.stats.TwoDimensionalCounter in the stanfordnlp/CoreNLP project: the call() method of the ApplyPatterns class.
/**
 * Applies each surface (token-sequence) pattern to the assigned sentences and
 * collects, per pattern: extracted candidate-phrase frequencies, matched token
 * spans, and the phrases whose tokens were all already labeled.
 */
@Override
public Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> call() throws Exception {
// CollectionValuedMap<String, Integer>();
try {
// Phrases that matched a pattern but whose tokens were all already labeled.
Set<CandidatePhrase> alreadyLabeledPhrases = new HashSet<>();
// Counts, per candidate phrase, how often each pattern extracted it.
TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
// For each pattern, the (sentence id, start token, inclusive end token) spans it matched.
CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
for (String sentid : sentids) {
List<CoreLabel> sent = sents.get(sentid).getTokens();
for (Entry<TokenSequencePattern, E> pEn : patterns.entrySet()) {
if (pEn.getKey() == null)
throw new RuntimeException("why is the pattern " + pEn + " null?");
TokenSequenceMatcher m = pEn.getKey().getMatcher(sent);
// //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
// m.setFindType(SequenceMatcher.FindType.FIND_ALL);
//Higher branch values makes the faster but uses more memory
m.setBranchLimit(5);
while (m.find()) {
// Token span of the "$term" capture group; e is exclusive (one past the end).
int s = m.start("$term");
int e = m.end("$term");
assert e - s <= PatternFactory.numWordsCompoundMapped.get(label) : "How come the pattern " + pEn.getKey() + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped.get(label) + " for label " + label;
String phrase = "";
String phraseLemma = "";
boolean useWordNotLabeled = false;
boolean doNotUse = false;
//find if the neighboring words are labeled - if so - club them together
if (constVars.clubNeighboringLabeledWords) {
// Extend the span leftwards over contiguous tokens already carrying this label.
for (int i = s - 1; i >= 0; i--) {
if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
s = i + 1;
break;
}
}
// Extend the span rightwards over contiguous tokens already carrying this label.
for (int i = e; i < sent.size(); i++) {
if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
e = i;
break;
}
}
}
//to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
boolean[] addedindices = new boolean[e - s];
Arrays.fill(addedindices, false);
for (int i = s; i < e; i++) {
CoreLabel l = sent.get(i);
// Mark the token as matched and record which pattern(s) matched it.
l.set(PatternsAnnotations.MatchedPattern.class, true);
if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null)
l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
SurfacePattern pSur = (SurfacePattern) pEn.getValue();
assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
assert l.get(PatternsAnnotations.MatchedPatterns.class) != null : "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.keySet();
l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);
// Reject the phrase if any token carries an "ignore" class/value pair.
for (Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
doNotUse = true;
}
}
boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
if (removePhrasesWithStopWords && containsStop) {
doNotUse = true;
} else {
if (!containsStop || !removeStopWordsFromSelectedPhrases) {
// Remember whether at least one token in the phrase is not yet labeled.
if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
useWordNotLabeled = true;
}
phrase += " " + l.word();
phraseLemma += " " + l.lemma();
addedindices[i - s] = true;
}
}
}
// Discard the phrase if a token was skipped strictly between two kept tokens
// (a stopword gap in the middle rather than at the ends).
for (int i = 0; i < addedindices.length; i++) {
if (i > 0 && i < addedindices.length - 1 && addedindices[i - 1] == true && addedindices[i] == false && addedindices[i + 1] == true) {
doNotUse = true;
break;
}
}
if (!doNotUse) {
matchedTokensByPat.add(pEn.getValue(), new Triple<>(sentid, s, e - 1));
phrase = phrase.trim();
if (!phrase.isEmpty()) {
phraseLemma = phraseLemma.trim();
CandidatePhrase candPhrase = CandidatePhrase.createOrGet(phrase, phraseLemma);
allFreq.incrementCount(candPhrase, pEn.getValue(), 1.0);
// If every token was already labeled, record the phrase separately.
if (!useWordNotLabeled)
alreadyLabeledPhrases.add(candPhrase);
}
}
}
}
}
return new Triple<>(allFreq, matchedTokensByPat, alreadyLabeledPhrases);
} catch (Exception e) {
// Print before rethrowing so failures in worker threads remain visible.
e.printStackTrace();
throw e;
}
}
Aggregations