Search in sources :

Example 41 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

the class FindTreebankTree method main.

/**
 * Command-line entry point: looks for a sentence among the trees loaded from one or more
 * paths and prints the file name and per-file tree number of every match.
 *
 * <p>Options, matched case-insensitively with one or two leading dashes and each followed by
 * a value: {@code -tagSeparator} (default {@code "_"}), {@code -encoding} (default
 * {@code "utf-8"}) and {@code -fileRegex} (default: no file filtering). An option name that
 * appears as the very last argument has no value to consume and is handled like any other
 * argument. The first other argument that is non-empty after trimming becomes the sentence
 * to search for; every later non-option argument is added as a path to load.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
    String needle = "";
    String tagSeparator = "_";
    String encoding = "utf-8";
    String fileRegex = "";
    List<String> paths = new ArrayList<>();

    for (int pos = 0; pos < args.length; pos++) {
        final String arg = args[pos];
        // An option is only recognized when a value follows it.
        final boolean hasValue = pos + 1 < args.length;
        if (hasValue && (arg.equalsIgnoreCase("-tagSeparator") || arg.equalsIgnoreCase("--tagSeparator"))) {
            tagSeparator = args[++pos];
        } else if (hasValue && (arg.equalsIgnoreCase("-encoding") || arg.equalsIgnoreCase("--encoding"))) {
            encoding = args[++pos];
        } else if (hasValue && (arg.equalsIgnoreCase("-fileRegex") || arg.equalsIgnoreCase("--fileRegex"))) {
            fileRegex = args[++pos];
        } else if (needle.isEmpty()) {
            needle = arg.trim();
        } else {
            paths.add(arg);
        }
    }

    TreeReaderFactory trf = new LabeledScoredTreeReaderFactory();

    // With a file regex, accept every directory plus the files whose simple name matches the
    // whole pattern; without one, pass a null filter to loadPath.
    final FileFilter filter;
    if (fileRegex.isEmpty()) {
        filter = null;
    } else {
        final Pattern filePattern = Pattern.compile(fileRegex);
        filter = candidate -> candidate.isDirectory() || filePattern.matcher(candidate.getName()).matches();
    }

    for (String path : paths) {
        // Each path gets its own treebank built with the chosen reader factory and encoding.
        DiskTreebank treebank = new DiskTreebank(trf, encoding);
        treebank.loadPath(path, filter);

        String activeFile = "";
        int treeNumber = 0;
        for (Iterator<Tree> trees = treebank.iterator(); trees.hasNext(); ) {
            // Restart the per-file tree numbering whenever the treebank reports a new file.
            final String reportedFile = treebank.getCurrentFilename();
            if (!activeFile.equals(reportedFile)) {
                activeFile = reportedFile;
                treeNumber = 0;
            }
            treeNumber++;

            final List<TaggedWord> sentence = trees.next().taggedYield();

            // Three candidate renderings of the tree's yield: untagged, untagged with the
            // spaces removed (useful for unsegmented text such as Chinese), and tagged with
            // the configured separator.
            final String untagged = SentenceUtils.listToString(sentence, true);
            final String unsegmented = untagged.replaceAll(" ", "");
            final String tagged = SentenceUtils.listToString(sentence, false, tagSeparator);

            final boolean matched = needle.equals(untagged)
                    || needle.equals(unsegmented)
                    || needle.equals(tagged);
            if (matched) {
                System.out.println("needle found in " + activeFile + " tree " + treeNumber);
            }
        }
    }
}
Also used : Pattern(java.util.regex.Pattern) ArrayList(java.util.ArrayList) TaggedWord(edu.stanford.nlp.ling.TaggedWord) FileFilter(java.io.FileFilter)

Example 42 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

the class TreeLemmatizer method transformTree.

/**
 * Sets the lemma on the label of every leaf of {@code t} that has a non-null label, and
 * returns the same tree.
 *
 * <p>The tag passed to the lemmatizer is the label's own tag when the label implements
 * {@link HasTag} and that tag is non-null; otherwise it is read from the tree's tagged
 * yield, which is computed at most once and only when first needed.
 *
 * @param t the tree whose leaf labels are updated in place
 * @return {@code t} itself
 * @throws IllegalArgumentException if a non-null leaf label does not implement
 *     {@link HasLemma}
 */
@Override
public Tree transformTree(Tree t) {
    final Morphology lemmatizer = new Morphology();
    List<TaggedWord> yieldTags = null;
    int position = 0;
    for (Tree leaf : t.getLeaves()) {
        final Label leafLabel = leaf.label();
        if (leafLabel != null) {
            String pos = (leafLabel instanceof HasTag) ? ((HasTag) leafLabel).tag() : null;
            if (pos == null) {
                // No usable tag on the label itself: fall back to the tagged yield.
                if (yieldTags == null) {
                    yieldTags = t.taggedYield();
                }
                pos = yieldTags.get(position).tag();
            }
            if (!(leafLabel instanceof HasLemma)) {
                throw new IllegalArgumentException("Got a tree with labels which do not support lemma");
            }
            final String lemma = lemmatizer.lemma(leafLabel.value(), pos, true);
            ((HasLemma) leafLabel).setLemma(lemma);
            // The position only advances for leaves that actually carried a label.
            position++;
        }
    }
    return t;
}
Also used : HasLemma(edu.stanford.nlp.ling.HasLemma) TaggedWord(edu.stanford.nlp.ling.TaggedWord) Morphology(edu.stanford.nlp.process.Morphology) Label(edu.stanford.nlp.ling.Label) HasTag(edu.stanford.nlp.ling.HasTag)

Example 43 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

the class TSVTaggedFileReaderTest method testReadBackwards.

/**
 * Reads the test file through a record configured with {@code tagColumn=0,wordColumn=1}
 * and checks the number of sentences, the size of the first sentence, and the tag and word
 * of every expected token.
 *
 * @throws IOException declared by the test; presumably raised when the test file cannot be
 *     created
 */
public void testReadBackwards() throws IOException {
    final File tsvFile = createTestFile();
    final TaggedFileRecord record = createRecord(tsvFile, "tagColumn=0,wordColumn=1,");

    final List<List<TaggedWord>> parsed = new ArrayList<>();
    for (List<TaggedWord> row : record.reader()) {
        parsed.add(row);
    }

    assertEquals(3, parsed.size());
    assertEquals(3, parsed.get(0).size());

    // Expected values, indexed as [sentence][token].
    final String[][] expectedTags = {{"A", "B", "C"}, {"D", "E"}, {"F"}};
    final String[][] expectedWords = {{"1", "2", "3"}, {"4", "5"}, {"6"}};

    // All tags are verified first, then all words.
    for (int s = 0; s < expectedTags.length; s++) {
        for (int w = 0; w < expectedTags[s].length; w++) {
            assertEquals(expectedTags[s][w], parsed.get(s).get(w).tag());
        }
    }
    for (int s = 0; s < expectedWords.length; s++) {
        for (int w = 0; w < expectedWords[s].length; w++) {
            assertEquals(expectedWords[s][w], parsed.get(s).get(w).word());
        }
    }
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord) ArrayList(java.util.ArrayList) List(java.util.List) File(java.io.File)

Aggregations

TaggedWord (edu.stanford.nlp.ling.TaggedWord): 43, HasWord (edu.stanford.nlp.ling.HasWord): 9, CoreLabel (edu.stanford.nlp.ling.CoreLabel): 5, DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor): 5, MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger): 5, Tree (edu.stanford.nlp.trees.Tree): 5, ArrayList (java.util.ArrayList): 5, Label (edu.stanford.nlp.ling.Label): 4, WordTag (edu.stanford.nlp.ling.WordTag): 4, List (java.util.List): 4, HasTag (edu.stanford.nlp.ling.HasTag): 3, TaggedFileRecord (edu.stanford.nlp.tagger.io.TaggedFileRecord): 3, File (java.io.File): 3, StringReader (java.io.StringReader): 3, Word (edu.stanford.nlp.ling.Word): 2, Morphology (edu.stanford.nlp.process.Morphology): 2, ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 2, GrammaticalStructure (edu.stanford.nlp.trees.GrammaticalStructure): 2, BufferedReader (java.io.BufferedReader): 2, IOException (java.io.IOException): 2