
Example 11 with TwoDimensionalCounter

Use of edu.stanford.nlp.stats.TwoDimensionalCounter in the CoreNLP project by stanfordnlp.

From the class ApplyDepPatterns, the call() method.

@Override
public Pair<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>> call() throws Exception {
    // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
    // CollectionValuedMap<String, Integer>();
    TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
    CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
    for (String sentid : sentids) {
        DataInstance sent = sents.get(sentid);
        List<CoreLabel> tokens = sent.getTokens();
        for (Map.Entry<SemgrexPattern, E> pEn : patterns.entrySet()) {
            if (pEn.getKey() == null)
                throw new RuntimeException("why is the pattern " + pEn + " null?");
            SemanticGraph graph = ((DataInstanceDep) sent).getGraph();
            //SemgrexMatcher m = pEn.getKey().matcher(graph);
            //TokenSequenceMatcher m = pEn.getKey().matcher(sent);
            //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
            //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
            //Higher branch values make the search faster but use more memory
            //m.setBranchLimit(5);
            Collection<ExtractedPhrase> matched = getMatchedTokensIndex(graph, pEn.getKey(), sent, label);
            for (ExtractedPhrase match : matched) {
                int s = match.startIndex;
                int e = match.endIndex + 1;
                String phrase = "";
                String phraseLemma = "";
                boolean useWordNotLabeled = false;
                boolean doNotUse = false;
                //check whether the neighboring words are labeled; if so, club them together into one phrase
                if (constVars.clubNeighboringLabeledWords) {
                    for (int i = s - 1; i >= 0; i--) {
                        if (tokens.get(i).get(constVars.getAnswerClass().get(label)).equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped.get(label)) {
                            s = i;
                        //System.out.println("for phrase " + match + " clubbing earlier word. new s is " + s);
                        } else
                            break;
                    }
                    for (int i = e; i < tokens.size(); i++) {
                        if (tokens.get(i).get(constVars.getAnswerClass().get(label)).equals(label) && (i - s + 1) <= PatternFactory.numWordsCompoundMapped.get(label)) {
                            e = i;
                        //System.out.println("for phrase " + match + " clubbing next word. new e is " + e);
                        } else
                            break;
                    }
                }
                //discard phrases with stop words in the middle, but keep phrases whose stop words were removed only at the ends when removeStopWordsFromSelectedPhrases is true
                boolean[] addedindices = new boolean[e - s];
                Arrays.fill(addedindices, false);
                for (int i = s; i < e; i++) {
                    CoreLabel l = tokens.get(i);
                    l.set(PatternsAnnotations.MatchedPattern.class, true);
                    if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null)
                        l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
                    Pattern pSur = pEn.getValue();
                    assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
                    assert l.get(PatternsAnnotations.MatchedPatterns.class) != null : "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.keySet();
                    l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);
                    for (Map.Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
                        if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
                            doNotUse = true;
                        }
                    }
                    boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
                    if (removePhrasesWithStopWords && containsStop) {
                        doNotUse = true;
                    } else {
                        if (!containsStop || !removeStopWordsFromSelectedPhrases) {
                            if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
                                useWordNotLabeled = true;
                            }
                            phrase += " " + l.word();
                            phraseLemma += " " + l.lemma();
                            addedindices[i - s] = true;
                        }
                    }
                }
                for (int i = 0; i < addedindices.length; i++) {
                    if (i > 0 && i < addedindices.length - 1 && addedindices[i - 1] && !addedindices[i] && addedindices[i + 1]) {
                        doNotUse = true;
                        break;
                    }
                }
                if (!doNotUse && useWordNotLabeled) {
                    matchedTokensByPat.add(pEn.getValue(), new Triple<>(sentid, s, e - 1));
                    phrase = phrase.trim();
                    phraseLemma = phraseLemma.trim();
                    allFreq.incrementCount(CandidatePhrase.createOrGet(phrase, phraseLemma, match.getFeatures()), pEn.getValue(), 1.0);
                }
            }
        }
    }
    return new Pair<>(allFreq, matchedTokensByPat);
}
Also used: SurfacePattern(edu.stanford.nlp.patterns.surface.SurfacePattern) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) TokenSequencePattern(edu.stanford.nlp.ling.tokensregex.TokenSequencePattern) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) CoreLabel(edu.stanford.nlp.ling.CoreLabel) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph)
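To see the counting idiom above in isolation, here is a minimal, self-contained sketch of the TwoDimensionalCounter calls this example relies on (incrementCount, firstKeySet, getCounter, totalCount). The phrase and pattern strings are hypothetical stand-ins for the CandidatePhrase keys and pattern values used in call():

import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.TwoDimensionalCounter;

public class TwoDimCounterSketch {
    public static void main(String[] args) {
        // Counts keyed by (phrase, pattern), mirroring allFreq above
        TwoDimensionalCounter<String, String> allFreq = new TwoDimensionalCounter<>();
        allFreq.incrementCount("heart attack", "diagnosed with __", 1.0);
        allFreq.incrementCount("heart attack", "suffering from __", 1.0);
        allFreq.incrementCount("aspirin", "prescribed __", 1.0);
        // Iterate over the first keys and inspect each per-phrase counter
        for (String phrase : allFreq.firstKeySet()) {
            Counter<String> perPattern = allFreq.getCounter(phrase);
            System.out.printf("%s: %d pattern(s), total count %.1f%n",
                phrase, perPattern.keySet().size(), allFreq.totalCount(phrase));
        }
    }
}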

Example 12 with TwoDimensionalCounter

Use of edu.stanford.nlp.stats.TwoDimensionalCounter in the CoreNLP project by stanfordnlp.

From the class MWEFrequencyDist, the main() method.

public static void main(String[] args) {
    if (args.length != 1) {
        System.err.printf("Usage: java %s file%n", MWEFrequencyDist.class.getName());
        System.exit(-1);
    }
    final File treeFile = new File(args[0]);
    TwoDimensionalCounter<String, String> mweLabelToString = new TwoDimensionalCounter<>();
    Set<String> uniquePOSSequences = Generics.newHashSet();
    try {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        TreeReaderFactory trf = new FrenchTreeReaderFactory();
        TreeReader tr = trf.newTreeReader(br);
        final TregexPattern pMWE = TregexPattern.compile("/^MW/");
        for (Tree t; (t = tr.readTree()) != null; ) {
            //Count MWE statistics
            TregexMatcher m = pMWE.matcher(t);
            while (m.findNextMatchingNode()) {
                Tree match = m.getMatch();
                String label = match.value();
                List<CoreLabel> yield = match.taggedLabeledYield();
                StringBuilder termYield = new StringBuilder();
                StringBuilder posYield = new StringBuilder();
                for (CoreLabel cl : yield) {
                    termYield.append(cl.word()).append(" ");
                    posYield.append(cl.tag()).append(" ");
                }
                mweLabelToString.incrementCount(label, termYield.toString().trim());
                uniquePOSSequences.add(posYield.toString().trim());
            }
        }
        //Closes the underlying reader
        tr.close();
        System.out.printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
        double nMWEs = mweLabelToString.totalCount();
        int nAllSingletons = 0;
        int nTokens = 0;
        for (String mweLabel : mweLabelToString.firstKeySet()) {
            int nSingletons = 0;
            double totalCount = mweLabelToString.totalCount(mweLabel);
            Counter<String> mc = mweLabelToString.getCounter(mweLabel);
            for (String term : mc.keySet()) {
                if (mc.getCount(term) == 1.0)
                    nSingletons++;
                nTokens += term.split("\\s+").length * (int) mc.getCount(term);
            }
            nAllSingletons += nSingletons;
            System.out.printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int) totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
        }
        System.out.printf("TOTAL:\t%d\t%d\t%.2f%n", (int) nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
        System.out.println("#tokens = " + nTokens);
        System.out.println("#unique MWE POS sequences = " + uniquePOSSequences.size());
    } catch (TregexParseException | IOException e) {
        // UnsupportedEncodingException and FileNotFoundException are subclasses of IOException
        e.printStackTrace();
    }
}
Also used: FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) InputStreamReader(java.io.InputStreamReader) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) BufferedReader(java.io.BufferedReader) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) File(java.io.File)
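The core of main() is the Tregex matching loop. The following sketch reproduces just that loop on a single hand-written tree so it runs without a treebank file; the bracketed tree string is invented for illustration, while the /^MW/ pattern and the yield-counting logic are taken from the example above:

import java.util.List;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;

public class MWECountSketch {
    public static void main(String[] args) {
        // A hypothetical French-style tree containing one MWN (multi-word noun) node
        Tree t = Tree.valueOf("(ROOT (SENT (MWN (N pomme) (P de) (N terre))))");
        TwoDimensionalCounter<String, String> mweLabelToString = new TwoDimensionalCounter<>();
        TregexPattern pMWE = TregexPattern.compile("/^MW/");
        TregexMatcher m = pMWE.matcher(t);
        while (m.findNextMatchingNode()) {
            Tree match = m.getMatch();
            // Build the terminal yield and count the (MWE label, surface string) pair
            List<CoreLabel> yield = match.taggedLabeledYield();
            StringBuilder termYield = new StringBuilder();
            for (CoreLabel cl : yield) termYield.append(cl.word()).append(" ");
            mweLabelToString.incrementCount(match.value(), termYield.toString().trim());
        }
        System.out.println(mweLabelToString);
    }
}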

Example 13 with TwoDimensionalCounter

Use of edu.stanford.nlp.stats.TwoDimensionalCounter in the CoreNLP project by stanfordnlp.

From the class GetPatternsFromDataMultiClass, the loadFromSavedPatternsWordsDir() method.

public static <E extends Pattern> Map<E, String> loadFromSavedPatternsWordsDir(GetPatternsFromDataMultiClass<E> model, Properties props) throws IOException, ClassNotFoundException {
    boolean labelSentsUsingModel = Boolean.parseBoolean(props.getProperty("labelSentsUsingModel", "true"));
    boolean applyPatsUsingModel = Boolean.parseBoolean(props.getProperty("applyPatsUsingModel", "true"));
    int numIterationsOfSavedPatternsToLoad = Integer.parseInt(props.getProperty(Flags.numIterationsOfSavedPatternsToLoad, String.valueOf(Integer.MAX_VALUE)));
    Map<E, String> labelsForPattterns = new HashMap<>();
    String patternsWordsDirValue = props.getProperty(Flags.patternsWordsDir);
    String patternsWordsDir;
    //    if(patternsWordsDirValue.endsWith(".zip")){
    //      File tempdir = File.createTempFile("patternswordsdir","dir");
    //      tempdir.deleteOnExit();
    //      tempdir.delete();
    //      tempdir.mkdirs();
    //      patternsWordsDir = tempdir.getAbsolutePath();
    //      unzip(patternsWordsDirValue, patternsWordsDir);
    //    }else
    patternsWordsDir = patternsWordsDirValue;
    String sentsOutFile = props.getProperty("sentsOutFile");
    String loadModelForLabels = props.getProperty(Flags.loadModelForLabels);
    List<String> loadModelForLabelsList = null;
    if (loadModelForLabels != null)
        loadModelForLabelsList = Arrays.asList(loadModelForLabels.split("[,;]"));
    for (String label : model.constVars.getLabels()) {
        if (loadModelForLabels != null && !loadModelForLabelsList.contains(label))
            continue;
        assert (new File(patternsWordsDir + "/" + label).exists()) : "Why does the directory " + patternsWordsDir + "/" + label + " not exist?";
        readClassesInEnv(patternsWordsDir + "/env.txt", model.constVars.env, ConstantsAndVariables.globalEnv);
        //Read the token mapping
        if (model.constVars.patternType.equals(PatternFactory.PatternType.SURFACE))
            Token.setClass2KeyMapping(new File(patternsWordsDir + "/tokenenv.txt"));
        //Load Patterns
        File patf = new File(patternsWordsDir + "/" + label + "/patternsEachIter.ser");
        if (patf.exists()) {
            Map<Integer, Counter<E>> patterns = IOUtils.readObjectFromFile(patf);
            if (numIterationsOfSavedPatternsToLoad < Integer.MAX_VALUE) {
                Set<Integer> toremove = new HashSet<>();
                for (Integer i : patterns.keySet()) {
                    if (i >= numIterationsOfSavedPatternsToLoad) {
                        System.out.println("Removing patterns from iteration " + i);
                        toremove.add(i);
                    }
                }
                for (Integer i : toremove) patterns.remove(i);
            }
            Counter<E> pats = Counters.flatten(patterns);
            for (E p : pats.keySet()) {
                labelsForPattterns.put(p, label);
            }
            numIterationsLoadedModel = Math.max(numIterationsLoadedModel, patterns.size());
            model.setLearnedPatterns(pats, label);
            model.setLearnedPatternsEachIter(patterns, label);
            Redwood.log(Redwood.DBG, "Loaded " + model.getLearnedPatterns().get(label).size() + " patterns from " + patf);
        }
        //Load Words
        File wordf = new File(patternsWordsDir + "/" + label + "/phrases.txt");
        if (wordf.exists()) {
            TreeMap<Integer, Counter<CandidatePhrase>> words = GetPatternsFromDataMultiClass.readLearnedWordsFromFile(wordf);
            model.constVars.setLearnedWordsEachIter(words, label);
            if (numIterationsOfSavedPatternsToLoad < Integer.MAX_VALUE) {
                Set<Integer> toremove = new HashSet<>();
                for (Integer i : words.keySet()) {
                    if (i >= numIterationsOfSavedPatternsToLoad) {
                        System.out.println("Removing patterns from iteration " + i);
                        toremove.add(i);
                    }
                }
                for (Integer i : toremove) words.remove(i);
            }
            numIterationsLoadedModel = Math.max(numIterationsLoadedModel, words.size());
            Redwood.log(Redwood.DBG, "Loaded " + words.size() + " phrases from " + wordf);
        }
        CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
        Iterator<Pair<Map<String, DataInstance>, File>> sentsIter = new ConstantsAndVariables.DataSentsIterator(model.constVars.batchProcessSents);
        TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted = new TwoDimensionalCounter<>();
        Set<CandidatePhrase> alreadyLabeledWords = new HashSet<>();
        while (sentsIter.hasNext()) {
            Pair<Map<String, DataInstance>, File> sents = sentsIter.next();
            if (labelSentsUsingModel) {
                Redwood.log(Redwood.DBG, "labeling sentences from " + sents.second() + " with the already learned words");
                assert sents.first() != null : "Why are sents null";
                model.labelWords(label, sents.first(), model.constVars.getLearnedWords(label).keySet(), sentsOutFile, matchedTokensByPat);
                if (sents.second().exists())
                    IOUtils.writeObjectToFile(sents, sents.second());
            }
            if (model.constVars.restrictToMatched || applyPatsUsingModel) {
                Redwood.log(Redwood.DBG, "Applying patterns to " + sents.first().size() + " sentences");
                model.constVars.invertedIndex.add(sents.first(), true);
                model.scorePhrases.applyPats(model.getLearnedPatterns(label), label, wordsandLemmaPatExtracted, matchedTokensByPat, alreadyLabeledWords);
            }
        }
        Counters.addInPlace(model.wordsPatExtracted.get(label), wordsandLemmaPatExtracted);
        System.out.println("All Extracted phrases are " + wordsandLemmaPatExtracted.firstKeySet());
    }
    System.out.flush();
    System.err.flush();
    return labelsForPattterns;
}
Also used: Counter(edu.stanford.nlp.stats.Counter) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) AtomicInteger(java.util.concurrent.atomic.AtomicInteger)
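The iteration-trimming and flattening step in the middle of this method is easy to miss. Here is a minimal sketch of just that step, using plain String patterns instead of the generic E pattern type; the Counters.flatten call and the cutoff logic come from the example, while the per-iteration data is invented:

import java.util.HashMap;
import java.util.Map;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;

public class TrimAndFlattenSketch {
    public static void main(String[] args) {
        // Per-iteration pattern counters, standing in for the contents of patternsEachIter.ser
        Map<Integer, Counter<String>> patterns = new HashMap<>();
        for (int iter = 0; iter < 4; iter++) {
            Counter<String> c = new ClassicCounter<>();
            c.incrementCount("pattern-from-iter-" + iter);
            patterns.put(iter, c);
        }
        int numIterationsOfSavedPatternsToLoad = 2;
        // Drop iterations at or beyond the cutoff, as loadFromSavedPatternsWordsDir does
        patterns.keySet().removeIf(i -> i >= numIterationsOfSavedPatternsToLoad);
        // Merge the surviving per-iteration counters into a single counter
        Counter<String> pats = Counters.flatten(patterns);
        System.out.println(pats.keySet()); // only patterns from iterations 0 and 1
    }
}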

Example 14 with TwoDimensionalCounter

Use of edu.stanford.nlp.stats.TwoDimensionalCounter in the CoreNLP project by stanfordnlp.

From the class ApplyPatterns, the call() method.

@Override
public Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> call() throws Exception {
    try {
        Set<CandidatePhrase> alreadyLabeledPhrases = new HashSet<>();
        TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
        CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
        for (String sentid : sentids) {
            List<CoreLabel> sent = sents.get(sentid).getTokens();
            for (Entry<TokenSequencePattern, E> pEn : patterns.entrySet()) {
                if (pEn.getKey() == null)
                    throw new RuntimeException("why is the pattern " + pEn + " null?");
                TokenSequenceMatcher m = pEn.getKey().getMatcher(sent);
                //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                //Higher branch values make the search faster but use more memory
                m.setBranchLimit(5);
                while (m.find()) {
                    int s = m.start("$term");
                    int e = m.end("$term");
                    assert e - s <= PatternFactory.numWordsCompoundMapped.get(label) : "How come the pattern " + pEn.getKey() + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped.get(label) + " for label " + label;
                    String phrase = "";
                    String phraseLemma = "";
                    boolean useWordNotLabeled = false;
                    boolean doNotUse = false;
                    //check whether the neighboring words are labeled; if so, club them together into one phrase
                    if (constVars.clubNeighboringLabeledWords) {
                        for (int i = s - 1; i >= 0; i--) {
                            if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                                s = i + 1;
                                break;
                            }
                        }
                        for (int i = e; i < sent.size(); i++) {
                            if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                                e = i;
                                break;
                            }
                        }
                    }
                    //discard phrases with stop words in the middle, but keep phrases whose stop words were removed only at the ends when removeStopWordsFromSelectedPhrases is true
                    boolean[] addedindices = new boolean[e - s];
                    Arrays.fill(addedindices, false);
                    for (int i = s; i < e; i++) {
                        CoreLabel l = sent.get(i);
                        l.set(PatternsAnnotations.MatchedPattern.class, true);
                        if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null)
                            l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
                        SurfacePattern pSur = (SurfacePattern) pEn.getValue();
                        assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
                        assert l.get(PatternsAnnotations.MatchedPatterns.class) != null : "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.keySet();
                        l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);
                        for (Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
                            if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
                                doNotUse = true;
                            }
                        }
                        boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop) {
                            doNotUse = true;
                        } else {
                            if (!containsStop || !removeStopWordsFromSelectedPhrases) {
                                if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
                                    useWordNotLabeled = true;
                                }
                                phrase += " " + l.word();
                                phraseLemma += " " + l.lemma();
                                addedindices[i - s] = true;
                            }
                        }
                    }
                    for (int i = 0; i < addedindices.length; i++) {
                        if (i > 0 && i < addedindices.length - 1 && addedindices[i - 1] && !addedindices[i] && addedindices[i + 1]) {
                            doNotUse = true;
                            break;
                        }
                    }
                    if (!doNotUse) {
                        matchedTokensByPat.add(pEn.getValue(), new Triple<>(sentid, s, e - 1));
                        phrase = phrase.trim();
                        if (!phrase.isEmpty()) {
                            phraseLemma = phraseLemma.trim();
                            CandidatePhrase candPhrase = CandidatePhrase.createOrGet(phrase, phraseLemma);
                            allFreq.incrementCount(candPhrase, pEn.getValue(), 1.0);
                            if (!useWordNotLabeled)
                                alreadyLabeledPhrases.add(candPhrase);
                        }
                    }
                }
            }
        }
        return new Triple<>(allFreq, matchedTokensByPat, alreadyLabeledPhrases);
    } catch (Exception e) {
        e.printStackTrace();
        throw e;
    }
}
Also used: CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) TokenSequencePattern(edu.stanford.nlp.ling.tokensregex.TokenSequencePattern) TokenSequenceMatcher(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) Triple(edu.stanford.nlp.util.Triple) CoreLabel(edu.stanford.nlp.ling.CoreLabel)
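Since this example hinges on the $term capture group (m.start("$term"), m.end("$term")), a small sketch of that TokensRegex mechanism may help. The pattern string and the hand-built token list are hypothetical; the compile/getMatcher/find/start/end calls are the ones used above:

import java.util.ArrayList;
import java.util.List;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;

public class TokenSeqSketch {
    public static void main(String[] args) {
        // Hand-built tokens; a real pipeline would produce these CoreLabels
        List<CoreLabel> sent = new ArrayList<>();
        for (String w : "patients took aspirin daily".split(" ")) {
            CoreLabel cl = new CoreLabel();
            cl.setWord(w);
            sent.add(cl);
        }
        // Hypothetical pattern: a token "took" followed by any one token, captured as $term
        TokenSequencePattern p = TokenSequencePattern.compile("/took/ (?$term [])");
        TokenSequenceMatcher m = p.getMatcher(sent);
        while (m.find()) {
            int s = m.start("$term");
            int e = m.end("$term");
            System.out.println("captured \"" + sent.get(s).word() + "\" at [" + s + ", " + e + ")");
        }
    }
}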

Aggregations

TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter): 14 uses
Tree (edu.stanford.nlp.trees.Tree): 6 uses
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 5 uses
TokenSequencePattern (edu.stanford.nlp.ling.tokensregex.TokenSequencePattern): 5 uses
ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 4 uses
Counter (edu.stanford.nlp.stats.Counter): 3 uses
TreeReader (edu.stanford.nlp.trees.TreeReader): 3 uses
TreeReaderFactory (edu.stanford.nlp.trees.TreeReaderFactory): 3 uses
Env (edu.stanford.nlp.ling.tokensregex.Env): 2 uses
SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph): 2 uses
SemgrexPattern (edu.stanford.nlp.semgraph.semgrex.SemgrexPattern): 2 uses
FrenchTreeReaderFactory (edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory): 2 uses
CollectionValuedMap (edu.stanford.nlp.util.CollectionValuedMap): 2 uses
Constructor (java.lang.reflect.Constructor): 2 uses
InvocationTargetException (java.lang.reflect.InvocationTargetException): 2 uses
Entry (java.util.Map.Entry): 2 uses
IOUtils (edu.stanford.nlp.io.IOUtils): 1 use
RegExFileFilter (edu.stanford.nlp.io.RegExFileFilter): 1 use
CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 1 use
GoldAnswerAnnotation (edu.stanford.nlp.ling.CoreAnnotations.GoldAnswerAnnotation): 1 use