Search in sources :

Example 1 with TwoDimensionalCounter

use of edu.stanford.nlp.stats.TwoDimensionalCounter in project CoreNLP by stanfordnlp.

The following example shows the call() method of the class ApplyPatternsMulti.

@Override
public Pair<TwoDimensionalCounter<Pair<String, String>, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>> call() throws Exception {
    // Runs all compiled surface patterns over this worker's sentence ids and returns:
    //   first:  allFreq — counts of (phrase, phraseLemma) pairs per matched pattern
    //   second: matchedTokensByPat — for each pattern, the (sentence id, start, end) spans it matched
    //CollectionValuedMap<String, Integer> tokensMatchedPattern = new CollectionValuedMap<String, Integer>();
    CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
    TwoDimensionalCounter<Pair<String, String>, E> allFreq = new TwoDimensionalCounter<>();
    for (String sentid : sentids) {
        List<CoreLabel> sent = sents.get(sentid).getTokens();
        //FIND_ALL is faster than FIND_NONOVERLAP
        Iterable<SequenceMatchResult<CoreMap>> matched = multiPatternMatcher.find(sent, SequenceMatcher.FindType.FIND_ALL);
        for (SequenceMatchResult<CoreMap> m : matched) {
            // "$term" is the capture group holding the candidate phrase; e is exclusive.
            int s = m.start("$term");
            int e = m.end("$term");
            E matchedPat = patterns.get(m.pattern());
            // Record the raw (pre-extension) match span for this pattern.
            matchedTokensByPat.add(matchedPat, new Triple<>(sentid, s, e));
            String phrase = "";
            String phraseLemma = "";
            // Only count the phrase if at least one of its tokens is not already labeled with `label`.
            boolean useWordNotLabeled = false;
            boolean doNotUse = false;
            //find if the neighboring words are labeled - if so - club them together
            if (constVars.clubNeighboringLabeledWords) {
                // Extend the span leftwards over contiguous tokens already labeled with `label`.
                // NOTE(review): if every token down to index 0 is labeled, the loop finishes
                // without updating s (arguably it should become 0) — confirm this is intended.
                // NOTE(review): get(...) may return null for unannotated tokens, which would NPE
                // on equals() — presumably tokens are always annotated here; verify.
                for (int i = s - 1; i >= 0; i--) {
                    if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                        s = i + 1;
                        break;
                    }
                }
                // Symmetric rightward extension; same caveat if the labeled run reaches sentence end.
                for (int i = e; i < sent.size(); i++) {
                    if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                        e = i;
                        break;
                    }
                }
            }
            //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
            // addedindices[i - s] == true iff token i contributed to the phrase string.
            boolean[] addedindices = new boolean[e - s];
            Arrays.fill(addedindices, false);
            for (int i = s; i < e; i++) {
                CoreLabel l = sent.get(i);
                // Mark the token as matched (both the boolean flag and the per-pattern set).
                l.set(PatternsAnnotations.MatchedPattern.class, true);
                if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class))
                    l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
                l.get(PatternsAnnotations.MatchedPatterns.class).add(matchedPat);
                // }
                // Discard the whole phrase if any token carries an ignore-class annotation for this label.
                for (Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
                    if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
                        doNotUse = true;
                    }
                }
                boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
                if (removePhrasesWithStopWords && containsStop) {
                    // Any stopword anywhere in the span disqualifies the phrase.
                    doNotUse = true;
                } else {
                    // Append the token unless it is a stopword being stripped from the phrase.
                    if (!containsStop || !removeStopWordsFromSelectedPhrases) {
                        // NOTE(review): this compares against label.toString() while the clubbing
                        // loops above compare against label directly — confirm both are equivalent.
                        if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
                            useWordNotLabeled = true;
                        }
                        phrase += " " + l.word();
                        phraseLemma += " " + l.lemma();
                        addedindices[i - s] = true;
                    }
                }
            }
            // Reject phrases with a removed token in the interior (the phrase would be discontiguous).
            // NOTE(review): only single-token gaps (true, false, true) are detected; a run of two or
            // more removed interior tokens is not flagged — confirm whether that is intended.
            for (int i = 0; i < addedindices.length; i++) {
                if (i > 0 && i < addedindices.length - 1 && addedindices[i - 1] == true && addedindices[i] == false && addedindices[i + 1] == true) {
                    doNotUse = true;
                    break;
                }
            }
            if (!doNotUse && useWordNotLabeled) {
                phrase = phrase.trim();
                phraseLemma = phraseLemma.trim();
                allFreq.incrementCount(new Pair<>(phrase, phraseLemma), matchedPat, 1.0);
            }
        }
    // Legacy single-pattern implementation kept below for reference (superseded by the
    // MultiPatternMatcher loop above).
    //      for (SurfacePattern pat : patterns.keySet()) {
    //        String patternStr = pat.toString();
    //
    //        TokenSequencePattern p = TokenSequencePattern.compile(constVars.env.get(label), patternStr);
    //        if (pat == null || p == null)
    //          throw new RuntimeException("why is the pattern " + pat + " null?");
    //
    //        TokenSequenceMatcher m = p.getMatcher(sent);
    //        while (m.find()) {
    //
    //          int s = m.start("$term");
    //          int e = m.end("$term");
    //
    //          String phrase = "";
    //          String phraseLemma = "";
    //          boolean useWordNotLabeled = false;
    //          boolean doNotUse = false;
    //          for (int i = s; i < e; i++) {
    //            CoreLabel l = sent.get(i);
    //            l.set(PatternsAnnotations.MatchedPattern.class, true);
    //            if (restrictToMatched) {
    //              tokensMatchedPattern.add(sentid, i);
    //            }
    //            for (Entry<Class, Object> ig : constVars.ignoreWordswithClassesDuringSelection.get(label).entrySet()) {
    //              if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
    //                doNotUse = true;
    //              }
    //            }
    //            boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), constVars.ignoreWordRegex, ignoreWords);
    //            if (removePhrasesWithStopWords && containsStop) {
    //              doNotUse = true;
    //            } else {
    //              if (!containsStop || !removeStopWordsFromSelectedPhrases) {
    //
    //                if (label == null || l.get(constVars.answerClass.get(label)) == null || !l.get(constVars.answerClass.get(label)).equals(label.toString())) {
    //                  useWordNotLabeled = true;
    //                }
    //                phrase += " " + l.word();
    //                phraseLemma += " " + l.lemma();
    //
    //              }
    //            }
    //          }
    //          if (!doNotUse && useWordNotLabeled) {
    //            phrase = phrase.trim();
    //            phraseLemma = phraseLemma.trim();
    //            allFreq.incrementCount(new Pair<String, String>(phrase, phraseLemma), pat, 1.0);
    //          }
    //        }
    //      }
    }
    return new Pair<>(allFreq, matchedTokensByPat);
}
Also used : CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) SequenceMatchResult(edu.stanford.nlp.ling.tokensregex.SequenceMatchResult) Pair(edu.stanford.nlp.util.Pair) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) Triple(edu.stanford.nlp.util.Triple) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 2 with TwoDimensionalCounter

use of edu.stanford.nlp.stats.TwoDimensionalCounter in project CoreNLP by stanfordnlp.

The following example shows the countTaggings() method of the class Treebanks.

/**
 * Tallies word/tag co-occurrences over an entire treebank and writes one
 * tab-separated line per word: {@code word<TAB>tag1<TAB>count1<TAB>tag2<TAB>count2...}.
 *
 * @param tb the treebank whose tagged yields are counted
 * @param pw destination for the tab-separated report (not closed here)
 */
private static void countTaggings(Treebank tb, final PrintWriter pw) {
    final TwoDimensionalCounter<String, String> wordTagCounts = new TwoDimensionalCounter<>();
    // Pass 1: accumulate counts of every (word, tag) pair in the treebank.
    tb.apply(tree -> {
        for (TaggedWord tw : tree.taggedYield()) {
            wordTagCounts.incrementCount(tw.word(), tw.tag());
        }
    });
    // Pass 2: emit one line per distinct word with its tag distribution.
    for (String word : wordTagCounts.firstKeySet()) {
        pw.print(word);
        pw.print('\t');
        Counter<String> tagCounts = wordTagCounts.getCounter(word);
        for (String tag : tagCounts.keySet()) {
            pw.print(tag + '\t' + tagCounts.getCount(tag) + '\t');
        }
        pw.println();
    }
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter)

Example 3 with TwoDimensionalCounter

use of edu.stanford.nlp.stats.TwoDimensionalCounter in project CoreNLP by stanfordnlp.

The following example shows the preprocessMWEs() method of the class SplitCanditoTrees.

/**
 * Repairs multi-word-expression annotations across a set of trees: a first pass
 * collects MWE label/terminal statistics, a second pass rewrites each tree in
 * place using those statistics.
 *
 * @param treeMap trees keyed by id; the trees are mutated in place
 */
static void preprocessMWEs(Map<String, Tree> treeMap) {
    TwoDimensionalCounter<String, String> labelToTerm = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String, String> termToLabel = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String, String> labelToPreterm = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String, String> pretermToLabel = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String, String> tagger = new TwoDimensionalCounter<>();
    // Pass 1: gather MWE statistics over every tree.
    for (Tree tree : treeMap.values()) {
        MWEPreprocessor.countMWEStatistics(tree, tagger, labelToPreterm, pretermToLabel, labelToTerm, termToLabel);
    }
    // Pass 2: fix each tree using the collected preterminal statistics and tagger.
    for (Tree tree : treeMap.values()) {
        MWEPreprocessor.traverseAndFix(tree, pretermToLabel, tagger);
    }
}
Also used : TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) Tree(edu.stanford.nlp.trees.Tree)

Example 4 with TwoDimensionalCounter

use of edu.stanford.nlp.stats.TwoDimensionalCounter in project CoreNLP by stanfordnlp.

The following example shows the main() method of the class MultiWordPreprocessor.

/**
 * Entry point. Reads a Spanish treebank file (the bare command-line argument),
 * trains a unigram tagger over its trees, then rewrites DUMMY POS tags in the
 * file via {@code resolveDummyTags} and prints fix-up statistics.
 *
 * @param args command-line options parsed against {@code argOptionDefs}; the
 *             bare ("") argument is the tree file path; {@code -help} prints usage
 */
public static void main(String[] args) {
    Properties options = StringUtils.argsToProperties(args, argOptionDefs);
    if (!options.containsKey("") || options.containsKey("help")) {
        log.info(usage());
        return;
    }
    boolean retainNER = PropertiesUtils.getBool(options, "ner", false);
    boolean normalize = PropertiesUtils.getBool(options, "normalize", true);
    final File treeFile = new File(options.getProperty(""));
    // Only the unigram tagger is actually consumed below; the four other counters
    // the original code allocated (labelTerm, termLabel, labelPreterm, pretermLabel)
    // were never used in this method and have been removed.
    TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<>();
    try {
        TreeReaderFactory trf = new SpanishTreeReaderFactory();
        // try-with-resources closes the whole reader chain (TreeReader closes the
        // underlying BufferedReader) even if readTree() throws mid-file; the old
        // explicit tr.close() leaked the streams on error.
        try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")))) {
            for (Tree t; (t = tr.readTree()) != null; ) {
                updateTagger(unigramTagger, t);
            }
        }
        System.out.println("Resolving DUMMY tags");
        resolveDummyTags(treeFile, unigramTagger, retainNER, normalize ? new SpanishTreeNormalizer(true, false, false) : null);
        System.out.println("#Unknown Word Types: " + ManualUWModel.nUnknownWordTypes);
        // NOTE(review): if nMissingPOS/nMissingPhrasal is 0 these print NaN% — harmless but worth confirming.
        System.out.println(String.format("#Missing POS: %d (fixed: %d, %.2f%%)", nMissingPOS, nFixedPOS, (double) nFixedPOS / nMissingPOS * 100));
        System.out.println(String.format("#Missing Phrasal: %d (fixed: %d, %.2f%%)", nMissingPhrasal, nFixedPhrasal, (double) nFixedPhrasal / nMissingPhrasal * 100));
        System.out.println("Done!");
    } catch (IOException e) {
        // UnsupportedEncodingException and FileNotFoundException are IOException
        // subtypes; the original three catch blocks all did exactly this.
        e.printStackTrace();
    }
}
Also used : TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) TreeReader(edu.stanford.nlp.trees.TreeReader) SpanishTreeReaderFactory(edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) SpanishTreeReaderFactory(edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) SpanishTreeNormalizer(edu.stanford.nlp.trees.international.spanish.SpanishTreeNormalizer)

Example 5 with TwoDimensionalCounter

use of edu.stanford.nlp.stats.TwoDimensionalCounter in project CoreNLP by stanfordnlp.

The following example shows the preprocessMWEs() method of the class FTBDataset.

/**
 * Corrects MWE annotations that lack internal POS labels.
 *
 * <p>Performs two sweeps over {@code treebank}: the first collects MWE
 * label/terminal statistics, the second fixes each tree in place using them.
 */
private void preprocessMWEs() {
    TwoDimensionalCounter<String, String> labelToTerm = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String, String> termToLabel = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String, String> labelToPreterm = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String, String> pretermToLabel = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String, String> tagger = new TwoDimensionalCounter<>();
    // Sweep 1: count MWE statistics for every tree in the treebank.
    for (Tree tree : treebank) {
        MWEPreprocessor.countMWEStatistics(tree, tagger, labelToPreterm, pretermToLabel, labelToTerm, termToLabel);
    }
    // Sweep 2: repair the trees using the preterminal statistics and the tagger.
    for (Tree tree : treebank) {
        MWEPreprocessor.traverseAndFix(tree, pretermToLabel, tagger);
    }
}
Also used : TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) Tree(edu.stanford.nlp.trees.Tree)

Aggregations

TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter)14 Tree (edu.stanford.nlp.trees.Tree)6 CoreLabel (edu.stanford.nlp.ling.CoreLabel)5 TokenSequencePattern (edu.stanford.nlp.ling.tokensregex.TokenSequencePattern)5 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)4 Counter (edu.stanford.nlp.stats.Counter)3 TreeReader (edu.stanford.nlp.trees.TreeReader)3 TreeReaderFactory (edu.stanford.nlp.trees.TreeReaderFactory)3 Env (edu.stanford.nlp.ling.tokensregex.Env)2 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)2 SemgrexPattern (edu.stanford.nlp.semgraph.semgrex.SemgrexPattern)2 FrenchTreeReaderFactory (edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory)2 CollectionValuedMap (edu.stanford.nlp.util.CollectionValuedMap)2 Constructor (java.lang.reflect.Constructor)2 InvocationTargetException (java.lang.reflect.InvocationTargetException)2 Entry (java.util.Map.Entry)2 IOUtils (edu.stanford.nlp.io.IOUtils)1 RegExFileFilter (edu.stanford.nlp.io.RegExFileFilter)1 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)1 GoldAnswerAnnotation (edu.stanford.nlp.ling.CoreAnnotations.GoldAnswerAnnotation)1