Search in sources :

Example 1 with DataInstanceDep

Use of edu.stanford.nlp.patterns.dep.DataInstanceDep in the project CoreNLP by stanfordnlp.

The class ScorePhrasesLearnFeatWt, method chooseUnknownPhrases.

/**
 * Samples candidate phrases from a sentence whose tokens are not annotated with {@code label}.
 *
 * <p>A phrase length is drawn from {@code 1..PatternFactory.numWordsCompoundMapped.get(label)}.
 * Depending on {@code constVars.patternType}:
 * <ul>
 *   <li>{@code DEP}: up to {@code min(maxNum, perSelect * tokenCount)} tokens are sampled without
 *       replacement as heads; for each accepted head, phrases are extracted from the sentence's
 *       dependency graph via {@code ExtractPhraseFromPattern.printSubGraph}.</li>
 *   <li>{@code SURFACE}: each token position is selected with probability {@code perSelect}; a
 *       window of the drawn length around it is joined with single spaces, and kept unless it
 *       touches a token labeled {@code label}, is empty, or is a function word.</li>
 * </ul>
 *
 * @param sent          the sentence to sample from; must be a {@code DataInstanceDep} in DEP mode
 * @param random        source of randomness for selecting tokens
 * @param perSelect     fraction (DEP) or per-token probability (SURFACE) of tokens to select
 * @param positiveClass annotation key under which a token's label is stored
 * @param label         the label value that marks a token as positive; must not be null
 * @param maxNum        if 0, nothing is sampled; in DEP mode also caps the number of sampled heads
 * @return the (possibly empty) set of sampled candidate phrases
 * @throws RuntimeException if the pattern type is neither DEP nor SURFACE
 */
Set<CandidatePhrase> chooseUnknownPhrases(DataInstance sent, Random random, double perSelect, Class positiveClass, String label, int maxNum) {
    Set<CandidatePhrase> unknownSamples = new HashSet<>();
    if (maxNum == 0)
        return unknownSamples;

    // label.equals(value) instead of value.equals(label): a token that carries no
    // positiveClass annotation is treated as "not labeled" rather than throwing an NPE.
    Function<CoreLabel, Boolean> acceptWord = coreLabel ->
        !label.equals(coreLabel.get(positiveClass)) && !constVars.functionWords.contains(coreLabel.word());

    // NOTE(review): the generator is re-seeded with 0 on every call, so the sampled length is
    // the same for every call with the same label. Kept as-is to preserve existing results;
    // confirm whether the caller-supplied `random` was intended here.
    Random r = new Random(0);
    int maxLength = PatternFactory.numWordsCompoundMapped.get(label);
    List<Integer> lengths = new ArrayList<>();
    for (int i = 1; i <= maxLength; i++) {
        lengths.add(i);
    }
    int length = CollectionUtils.sample(lengths, r);

    if (constVars.patternType.equals(PatternFactory.PatternType.DEP)) {
        ExtractPhraseFromPattern extract = new ExtractPhraseFromPattern(true, length);
        SemanticGraph g = ((DataInstanceDep) sent).getGraph();
        List<CoreLabel> sentTokens = sent.getTokens();
        int numHeads = Math.min(maxNum, (int) (perSelect * sentTokens.size()));
        Collection<CoreLabel> sampledHeads = CollectionUtils.sampleWithoutReplacement(sentTokens, numHeads, random);
        //TODO: change this for more efficient implementation
        List<String> textTokens = sentTokens.stream().map(CoreLabel::word).collect(Collectors.toList());
        for (CoreLabel head : sampledHeads) {
            if (!acceptWord.apply(head))
                continue;
            IndexedWord w = g.getNodeByIndex(head.index());
            List<String> outputPhrases = new ArrayList<>();
            List<ExtractedPhrase> extractedPhrases = new ArrayList<>();
            List<IntPair> outputIndices = new ArrayList<>();
            extract.printSubGraph(g, w, new ArrayList<>(), textTokens, outputPhrases, outputIndices, new ArrayList<>(), new ArrayList<>(), false, extractedPhrases, null, acceptWord);
            for (ExtractedPhrase p : extractedPhrases) {
                unknownSamples.add(CandidatePhrase.createOrGet(p.getValue(), null, p.getFeatures()));
            }
        }
    } else if (constVars.patternType.equals(PatternFactory.PatternType.SURFACE)) {
        // NOTE(review): maxNum does not cap the number of phrases in this branch (only the
        // maxNum == 0 shortcut applies). Left unchanged; confirm whether a cap is intended.
        CoreLabel[] tokens = sent.getTokens().toArray(new CoreLabel[0]);
        // Window offsets depend only on the sampled length, so compute them once.
        // length >= 1, hence integer division equals the original (int) ((length - 1) / 2.0).
        int left = (length - 1) / 2;
        int right = length - 1 - left;
        for (int i = 0; i < tokens.length; i++) {
            if (random.nextDouble() >= perSelect)
                continue;
            StringBuilder phrase = new StringBuilder();
            boolean hasPositive = false;
            for (int j = Math.max(0, i - left); j < tokens.length && j <= i + right; j++) {
                if (label.equals(tokens[j].get(positiveClass))) {
                    hasPositive = true;
                    break;
                }
                phrase.append(' ').append(tokens[j].word());
            }
            if (hasPositive)
                continue;
            String ph = phrase.toString().trim();
            if (!ph.isEmpty() && !constVars.functionWords.contains(ph)) {
                unknownSamples.add(CandidatePhrase.createOrGet(ph));
            }
        }
    } else
        throw new RuntimeException("not yet implemented");
    return unknownSamples;
}
Also used : java.util(java.util) ExtractPhraseFromPattern(edu.stanford.nlp.patterns.dep.ExtractPhraseFromPattern) edu.stanford.nlp.util(edu.stanford.nlp.util) ConcurrentHashCounter(edu.stanford.nlp.util.concurrent.ConcurrentHashCounter) Function(java.util.function.Function) edu.stanford.nlp.stats(edu.stanford.nlp.stats) AtomicDouble(edu.stanford.nlp.util.concurrent.AtomicDouble) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) RVFDatum(edu.stanford.nlp.ling.RVFDatum) Option(edu.stanford.nlp.util.ArgumentParser.Option) IndexedWord(edu.stanford.nlp.ling.IndexedWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) ScorePhraseMeasures(edu.stanford.nlp.patterns.ConstantsAndVariables.ScorePhraseMeasures) DataInstanceDep(edu.stanford.nlp.patterns.dep.DataInstanceDep) BufferedWriter(java.io.BufferedWriter) java.util.concurrent(java.util.concurrent) IOUtils(edu.stanford.nlp.io.IOUtils) Redwood(edu.stanford.nlp.util.logging.Redwood) FileWriter(java.io.FileWriter) BasicDatum(edu.stanford.nlp.ling.BasicDatum) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) File(java.io.File) ExtractedPhrase(edu.stanford.nlp.patterns.dep.ExtractedPhrase) edu.stanford.nlp.classify(edu.stanford.nlp.classify) Entry(java.util.Map.Entry) ExtractedPhrase(edu.stanford.nlp.patterns.dep.ExtractedPhrase) DataInstanceDep(edu.stanford.nlp.patterns.dep.DataInstanceDep) CoreLabel(edu.stanford.nlp.ling.CoreLabel) ExtractPhraseFromPattern(edu.stanford.nlp.patterns.dep.ExtractPhraseFromPattern) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) IndexedWord(edu.stanford.nlp.ling.IndexedWord)

Aggregations

edu.stanford.nlp.classify (edu.stanford.nlp.classify)1 IOUtils (edu.stanford.nlp.io.IOUtils)1 BasicDatum (edu.stanford.nlp.ling.BasicDatum)1 CoreLabel (edu.stanford.nlp.ling.CoreLabel)1 IndexedWord (edu.stanford.nlp.ling.IndexedWord)1 RVFDatum (edu.stanford.nlp.ling.RVFDatum)1 ScorePhraseMeasures (edu.stanford.nlp.patterns.ConstantsAndVariables.ScorePhraseMeasures)1 DataInstanceDep (edu.stanford.nlp.patterns.dep.DataInstanceDep)1 ExtractPhraseFromPattern (edu.stanford.nlp.patterns.dep.ExtractPhraseFromPattern)1 ExtractedPhrase (edu.stanford.nlp.patterns.dep.ExtractedPhrase)1 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)1 edu.stanford.nlp.stats (edu.stanford.nlp.stats)1 edu.stanford.nlp.util (edu.stanford.nlp.util)1 Option (edu.stanford.nlp.util.ArgumentParser.Option)1 AtomicDouble (edu.stanford.nlp.util.concurrent.AtomicDouble)1 ConcurrentHashCounter (edu.stanford.nlp.util.concurrent.ConcurrentHashCounter)1 Redwood (edu.stanford.nlp.util.logging.Redwood)1 BufferedWriter (java.io.BufferedWriter)1 File (java.io.File)1 FileWriter (java.io.FileWriter)1