Search in sources :

Example 11 with TreeReader

use of edu.stanford.nlp.trees.TreeReader in project CoreNLP by stanfordnlp.

From the class CreateClauseDataset: the method processDirectory.

/**
   * Process all the trees in the given directory. For example, the WSJ section of the Penn Treebank.
   *
   * @param name The name of the directory we are processing.
   * @param directory The directory we are processing.
   * @return A dataset of subject/object pairs in the trees in the directory.
   *         This is a list of sentences, such that each sentence has a collection of pairs of spans.
   *         Each pair of spans is a subject/object span pair that constitutes a valid extraction.
   * @throws IOException If a treebank file cannot be opened or read.
   */
private static List<Pair<CoreMap, Collection<Pair<Span, Span>>>> processDirectory(String name, File directory) throws IOException {
    forceTrack("Processing " + name);
    // Prepare the files to iterate over
    Iterable<File> files = IOUtils.iterFilesRecursive(directory, "mrg");
    int numTreesProcessed = 0;
    List<Pair<CoreMap, Collection<Pair<Span, Span>>>> trainingData = new ArrayList<>(1024);
    // Iterate over the files
    for (File file : files) {
        // Fix: try-with-resources closes the reader for every file; previously the
        // reader was never closed, leaking one file handle per treebank file.
        try (TreeReader reader = new PennTreeReader(IOUtils.readerFromFile(file))) {
            Tree tree;
            while ((tree = reader.readTree()) != null) {
                try {
                    // Prepare the tree: index the tokens and compute character spans
                    tree.indexSpans();
                    tree.setSpans();
                    // Get relevant information from sentence
                    List<CoreLabel> tokens = tree.getLeaves().stream().map(leaf -> (CoreLabel) leaf.label()).collect(Collectors.toList());
                    SemanticGraph graph = parse(tree);
                    Map<Integer, Span> targets = findTraceTargets(tree);
                    Map<Integer, Integer> sources = findTraceSources(tree);
                    // Create a sentence object carrying the tokens and all three dependency views
                    CoreMap sentence = new ArrayCoreMap(4) {

                        {
                            set(CoreAnnotations.TokensAnnotation.class, tokens);
                            set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, graph);
                            set(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class, graph);
                            set(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class, graph);
                        }
                    };
                    natlog.doOneSentence(null, sentence);
                    // Generate training data
                    Collection<Pair<Span, Span>> trainingDataFromSentence = subjectObjectPairs(graph, tokens, targets, sources);
                    trainingData.add(Pair.makePair(sentence, trainingDataFromSentence));
                    // Debug print every 100 trees
                    numTreesProcessed += 1;
                    if (numTreesProcessed % 100 == 0) {
                        log("[" + new DecimalFormat("00000").format(numTreesProcessed) + "] " + countDatums(trainingData) + " known extractions");
                    }
                } catch (Throwable t) {
                    // Deliberately broad: skip any malformed tree (including Errors from
                    // deep parses) rather than aborting the whole directory run.
                    t.printStackTrace();
                }
            }
        }
    }
    // End
    log("" + numTreesProcessed + " trees processed yielding " + countDatums(trainingData) + " known extractions");
    endTrack("Processing " + name);
    return trainingData;
}
Also used : TreeReader(edu.stanford.nlp.trees.TreeReader) java.util(java.util) edu.stanford.nlp.util(edu.stanford.nlp.util) Tree(edu.stanford.nlp.trees.Tree) SemgrexMatcher(edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher) UniversalEnglishGrammaticalStructureFactory(edu.stanford.nlp.trees.UniversalEnglishGrammaticalStructureFactory) Matcher(java.util.regex.Matcher) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) IndexedWord(edu.stanford.nlp.ling.IndexedWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) IOUtils(edu.stanford.nlp.io.IOUtils) Redwood(edu.stanford.nlp.util.logging.Redwood) PennTreeReader(edu.stanford.nlp.trees.PennTreeReader) DecimalFormat(java.text.DecimalFormat) Util(edu.stanford.nlp.util.logging.Redwood.Util) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) File(java.io.File) Span(edu.stanford.nlp.ie.machinereading.structure.Span) Annotation(edu.stanford.nlp.pipeline.Annotation) HasIndex(edu.stanford.nlp.ling.HasIndex) TSVSentenceProcessor(edu.stanford.nlp.process.TSVSentenceProcessor) Pattern(java.util.regex.Pattern) InputStream(java.io.InputStream) DecimalFormat(java.text.DecimalFormat) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) TreeReader(edu.stanford.nlp.trees.TreeReader) PennTreeReader(edu.stanford.nlp.trees.PennTreeReader) Span(edu.stanford.nlp.ie.machinereading.structure.Span) CoreLabel(edu.stanford.nlp.ling.CoreLabel) PennTreeReader(edu.stanford.nlp.trees.PennTreeReader) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) Tree(edu.stanford.nlp.trees.Tree) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) 
File(java.io.File)

Example 12 with TreeReader

use of edu.stanford.nlp.trees.TreeReader in project CoreNLP by stanfordnlp.

From the class MWETreeVisitorExternal: the method main.

/**
   * For debugging: reads ATB trees from a file, applies the MWE visitor to each
   * tree, and prints the transformed trees to stdout.
   *
   * @param args args[0] is the path to an ATB tree file.
   */
public static void main(String[] args) {
    if (args.length != 1) {
        System.err.printf("Usage: java %s atb_tree_file > atb_tree_file.out%n", MWETreeVisitorExternal.class.getName());
        System.exit(-1);
    }
    TreeReaderFactory trf = new ArabicTreeReaderFactory();
    // Fix: try-with-resources guarantees the reader is closed even on error;
    // previously tr.close() was skipped whenever an exception escaped the loop.
    try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF-8")))) {
        TreeVisitor visitor = new MWETreeVisitorExternal();
        int treeId = 0;
        for (Tree tree; (tree = tr.readTree()) != null; ++treeId) {
            if (tree.value().equals("ROOT")) {
                // Skip over the ROOT tag
                tree = tree.firstChild();
            }
            visitor.visitTree(tree);
            System.out.println(tree.toString());
        }
        System.err.printf("Processed %d trees.%n", treeId);
    } catch (IOException e) {
        // Subsumes the original UnsupportedEncodingException and
        // FileNotFoundException clauses, which had identical handling.
        e.printStackTrace();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) ArabicTreeReaderFactory(edu.stanford.nlp.trees.international.arabic.ArabicTreeReaderFactory) TreeVisitor(edu.stanford.nlp.trees.TreeVisitor) BufferedReader(java.io.BufferedReader) Tree(edu.stanford.nlp.trees.Tree) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) ArabicTreeReaderFactory(edu.stanford.nlp.trees.international.arabic.ArabicTreeReaderFactory)

Example 13 with TreeReader

use of edu.stanford.nlp.trees.TreeReader in project CoreNLP by stanfordnlp.

From the class MWEFrequencyDist: the method main.

/**
 * Reads a French treebank file, collects frequency statistics over
 * multi-word-expression (MW*) subtrees, and prints a per-label summary table.
 *
 * @param args args[0] is the path to the tree file.
 */
public static void main(String[] args) {
    if (args.length != 1) {
        System.err.printf("Usage: java %s file%n", MWEFrequencyDist.class.getName());
        System.exit(-1);
    }
    final File treeFile = new File(args[0]);
    // label -> surface string -> count
    TwoDimensionalCounter<String, String> mweLabelToString = new TwoDimensionalCounter<>();
    Set<String> uniquePOSSequences = Generics.newHashSet();
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    // Fix: try-with-resources closes the reader (and its underlying streams) even
    // if an exception escapes the loop; previously tr.close() would be skipped.
    try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")))) {
        // Matches any node whose label starts with "MW" (MWN, MWV, ...)
        final TregexPattern pMWE = TregexPattern.compile("/^MW/");
        for (Tree t; (t = tr.readTree()) != null; ) {
            //Count MWE statistics
            TregexMatcher m = pMWE.matcher(t);
            while (m.findNextMatchingNode()) {
                Tree match = m.getMatch();
                String label = match.value();
                List<CoreLabel> yield = match.taggedLabeledYield();
                StringBuilder termYield = new StringBuilder();
                StringBuilder posYield = new StringBuilder();
                for (CoreLabel cl : yield) {
                    termYield.append(cl.word()).append(" ");
                    posYield.append(cl.tag()).append(" ");
                }
                mweLabelToString.incrementCount(label, termYield.toString().trim());
                uniquePOSSequences.add(posYield.toString().trim());
            }
        }
        System.out.printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
        double nMWEs = mweLabelToString.totalCount();
        int nAllSingletons = 0;
        int nTokens = 0;
        for (String mweLabel : mweLabelToString.firstKeySet()) {
            int nSingletons = 0;
            double totalCount = mweLabelToString.totalCount(mweLabel);
            Counter<String> mc = mweLabelToString.getCounter(mweLabel);
            for (String term : mc.keySet()) {
                if (mc.getCount(term) == 1.0)
                    nSingletons++;
                // Token count: words per expression times occurrences
                nTokens += term.split("\\s+").length * (int) mc.getCount(term);
            }
            nAllSingletons += nSingletons;
            System.out.printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int) totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
        }
        System.out.printf("TOTAL:\t%d\t%d\t%.2f%n", (int) nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
        System.out.println("#tokens = " + nTokens);
        System.out.println("#unique MWE POS sequences = " + uniquePOSSequences.size());
    } catch (TregexParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        // Subsumes the original FileNotFoundException and
        // UnsupportedEncodingException clauses (identical handling).
        e.printStackTrace();
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) InputStreamReader(java.io.InputStreamReader) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) BufferedReader(java.io.BufferedReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) File(java.io.File)

Example 14 with TreeReader

use of edu.stanford.nlp.trees.TreeReader in project CoreNLP by stanfordnlp.

From the class MungeTreesWithMorfetteAnalyses: the method main.

/**
   * Merges Morfette lemma/tag analyses into the leaves of French treebank trees
   * and prints the munged trees to stdout.
   *
   * @param args args[0] is the tree file, args[1] is the Morfette TNT file.
   */
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.printf("Usage: java %s tree_file morfette_tnt_file%n", MungeTreesWithMorfetteAnalyses.class.getName());
        System.exit(-1);
    }
    String treeFile = args[0];
    String morfetteFile = args[1];
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    // Fix: try-with-resources closes the reader even when an exception (or the
    // assert below) escapes the loop; previously tr.close() would be skipped.
    try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")))) {
        Iterator<List<CoreLabel>> morfetteItr = new MorfetteFileIterator(morfetteFile);
        // Walk both inputs in lock-step: one tree per Morfette sentence
        for (Tree tree; (tree = tr.readTree()) != null && morfetteItr.hasNext(); ) {
            List<CoreLabel> analysis = morfetteItr.next();
            List<Label> yield = tree.yield();
            assert analysis.size() == yield.size();
            int yieldLen = yield.size();
            for (int i = 0; i < yieldLen; ++i) {
                CoreLabel tokenAnalysis = analysis.get(i);
                Label token = yield.get(i);
                String lemma = getLemma(token.value(), tokenAnalysis.lemma());
                // Rewrite the leaf as word<MORPHO>lemma<LEMMA>tag
                String newLeaf = String.format("%s%s%s%s%s", token.value(), MorphoFeatureSpecification.MORPHO_MARK, lemma, MorphoFeatureSpecification.LEMMA_MARK, tokenAnalysis.tag());
                ((CoreLabel) token).setValue(newLeaf);
            }
            System.out.println(tree.toString());
        }
        // One input ran out before the other: warn about the mismatch
        if (tr.readTree() != null || morfetteItr.hasNext()) {
            log.info("WARNING: Uneven input files!");
        }
    } catch (IOException e) {
        // Subsumes the original UnsupportedEncodingException and
        // FileNotFoundException clauses, which had identical handling.
        e.printStackTrace();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label) FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) BufferedReader(java.io.BufferedReader) Tree(edu.stanford.nlp.trees.Tree) ArrayList(java.util.ArrayList) List(java.util.List) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory)

Example 15 with TreeReader

use of edu.stanford.nlp.trees.TreeReader in project CoreNLP by stanfordnlp.

From the class TreeToMorfette: the method main.

/**
   * Converts French treebank trees to Morfette training format: one
   * "word lemma morph" line per token, blank line between sentences.
   *
   * @param args args[0] is the path to the tree file.
   */
public static void main(String[] args) {
    if (args.length != 1) {
        System.err.printf("Usage: java %s tree_file%n", TreeToMorfette.class.getName());
        System.exit(-1);
    }
    String treeFile = args[0];
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    // Fix: try-with-resources closes the reader even if an exception escapes
    // the loop; previously tr.close() would be skipped on error.
    try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")))) {
        for (Tree tree1; (tree1 = tr.readTree()) != null; ) {
            List<Label> pretermYield = tree1.preTerminalYield();
            List<Label> yield = tree1.yield();
            int yieldLen = yield.size();
            for (int i = 0; i < yieldLen; ++i) {
                CoreLabel rawToken = (CoreLabel) yield.get(i);
                String word = rawToken.value();
                String morphStr = rawToken.originalText();
                // Leaf values encode "lemma|morph"; split them apart
                Pair<String, String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, morphStr);
                String lemma = lemmaMorph.first();
                String morph = lemmaMorph.second();
                if (morph == null || morph.equals("") || morph.equals("XXX")) {
                    // No usable morph analysis: fall back to the POS tag
                    morph = ((CoreLabel) pretermYield.get(i)).value();
                }
                System.out.printf("%s %s %s%n", word, lemma, morph);
            }
            // Blank line separates sentences in Morfette format
            System.out.println();
        }
    } catch (IOException e) {
        // Subsumes the original UnsupportedEncodingException and
        // FileNotFoundException clauses, which had identical handling.
        e.printStackTrace();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label) FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) BufferedReader(java.io.BufferedReader) Tree(edu.stanford.nlp.trees.Tree) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory)

Aggregations

TreeReader (edu.stanford.nlp.trees.TreeReader)20 Tree (edu.stanford.nlp.trees.Tree)19 TreeReaderFactory (edu.stanford.nlp.trees.TreeReaderFactory)17 IOException (java.io.IOException)7 CoreLabel (edu.stanford.nlp.ling.CoreLabel)6 FrenchTreeReaderFactory (edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory)6 FileInputStream (java.io.FileInputStream)6 InputStreamReader (java.io.InputStreamReader)6 BufferedReader (java.io.BufferedReader)5 FileNotFoundException (java.io.FileNotFoundException)5 UnsupportedEncodingException (java.io.UnsupportedEncodingException)4 TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter)3 PennTreeReader (edu.stanford.nlp.trees.PennTreeReader)3 TreebankLanguagePack (edu.stanford.nlp.trees.TreebankLanguagePack)3 SpanishTreeReaderFactory (edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory)3 Pattern (java.util.regex.Pattern)3 Label (edu.stanford.nlp.ling.Label)2 LabeledScoredTreeReaderFactory (edu.stanford.nlp.trees.LabeledScoredTreeReaderFactory)2 TreeTransformer (edu.stanford.nlp.trees.TreeTransformer)2 ArabicTreeReaderFactory (edu.stanford.nlp.trees.international.arabic.ArabicTreeReaderFactory)2