Search in sources:

Example 11 with HasWord

Use of edu.stanford.nlp.ling.HasWord in the CoreNLP project by stanfordnlp.

From the class ThreadedParserSlowITest, method processFile.

/**
 * Re-parses the word yield of each input tree with the given parser and
 * collects the resulting parse trees, logging progress every 10 trees and
 * once more when the final tree is done.
 *
 * @param parser the parser used to parse each sentence
 * @param input  trees whose yields are re-parsed
 * @return the parser's output trees, in the same order as {@code input}
 */
public static List<Tree> processFile(LexicalizedParser parser, List<Tree> input) {
    // Presize to the known result count and use the diamond operator.
    List<Tree> results = new ArrayList<>(input.size());
    for (Tree tree : input) {
        List<HasWord> sentence = tree.yieldHasWord();
        Tree output = parser.parseTree(sentence);
        results.add(output);
        // Report progress every 10 trees, plus a final report on completion.
        if (results.size() % 10 == 0 || results.size() == input.size()) {
            System.out.println("Processed " + results.size() + " trees");
        }
    }
    return results;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) ArrayList(java.util.ArrayList) Tree(edu.stanford.nlp.trees.Tree)

Example 12 with HasWord

Use of edu.stanford.nlp.ling.HasWord in the CoreNLP project by stanfordnlp.

From the class WordToTaggedWordProcessor, method main.

/**
 * This will print out some text, recognizing tags.  It can be used to
 * test tag breaking.  <br>  Usage: <code>
 * java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl
 * </code>
 *
 * @param args Command line argument: a file or URL
 */
public static void main(String[] args) {
    if (args.length != 1) {
        System.out.println("usage: java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl");
        // A usage error is a failure: exit nonzero, not 0.
        System.exit(1);
    }
    String filename = args[0];
    try {
        Document<HasWord, Word, Word> d;
        // Accept both http and https URLs; anything else is treated as a file path.
        if (filename.startsWith("http://") || filename.startsWith("https://")) {
            Document<HasWord, Word, Word> dpre = new BasicDocument<HasWord>().init(new URL(filename));
            // Strip markup tags from web documents before tagged-word processing.
            DocumentProcessor<Word, Word, HasWord, Word> notags = new StripTagsProcessor<>();
            d = notags.processDocument(dpre);
        } else {
            d = new BasicDocument<HasWord>().init(new File(filename));
        }
        DocumentProcessor<Word, HasWord, HasWord, Word> proc = new WordToTaggedWordProcessor<>();
        Document<HasWord, Word, HasWord> sentd = proc.processDocument(d);
        // Print each processed word with its position index.
        int i = 0;
        for (HasWord w : sentd) {
            System.out.println(i + ": " + w);
            i++;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) Word(edu.stanford.nlp.ling.Word) HasWord(edu.stanford.nlp.ling.HasWord) TaggedWord(edu.stanford.nlp.ling.TaggedWord) BasicDocument(edu.stanford.nlp.ling.BasicDocument) URL(java.net.URL) File(java.io.File)

Example 13 with HasWord

Use of edu.stanford.nlp.ling.HasWord in the CoreNLP project by stanfordnlp.

From the class DocumentPreprocessor, method main.

/**
 * A simple, deterministic sentence-splitter. This method only supports the English
 * tokenizer, so for other languages you should run the tokenizer first and then
 * run this sentence splitter with the "-whitespaceTokenization" option.
 *
 * @param args Command-line arguments
 * @throws IOException if input cannot be read
 */
public static void main(String[] args) throws IOException {
    final Properties options = StringUtils.argsToProperties(args, argOptionDefs());
    if (options.containsKey("help")) {
        log.info(usage());
        return;
    }
    // Command-line flags
    String encoding = options.getProperty("encoding", "utf-8");
    boolean printSentenceLengths = PropertiesUtils.getBool(options, "printSentenceLengths", false);
    String xmlElementDelimiter = options.getProperty("xml", null);
    DocType docType = xmlElementDelimiter == null ? DocType.Plain : DocType.XML;
    // -noTokenization means one sentence per input line.
    String sentenceDelimiter = options.containsKey("noTokenization") ? System.getProperty("line.separator") : null;
    String tagDelimiter = options.getProperty("tag", null);
    String[] sentenceDelims = null;
    // Set up the TokenizerFactory; the four tokenizer flags are mutually exclusive.
    int numFactoryFlags = 0;
    boolean suppressEscaping = options.containsKey("suppressEscaping");
    if (suppressEscaping)
        numFactoryFlags += 1;
    boolean customTokenizer = options.containsKey("tokenizerOptions");
    if (customTokenizer)
        numFactoryFlags += 1;
    boolean printOriginalText = options.containsKey("printOriginalText");
    if (printOriginalText)
        numFactoryFlags += 1;
    boolean whitespaceTokenization = options.containsKey("whitespaceTokenization");
    if (whitespaceTokenization)
        numFactoryFlags += 1;
    if (numFactoryFlags > 1) {
        log.info("Only one tokenizer flag allowed at a time: ");
        log.info("  -suppressEscaping, -tokenizerOptions, -printOriginalText, -whitespaceTokenization");
        return;
    }
    TokenizerFactory<? extends HasWord> tf = null;
    if (suppressEscaping) {
        tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "ptb3Escaping=false");
    } else if (customTokenizer) {
        tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), options.getProperty("tokenizerOptions"));
    } else if (printOriginalText) {
        // invertible=true keeps before/after text so the original can be reproduced.
        tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");
    } else if (whitespaceTokenization) {
        // Whitespace tokenization: tf stays null and newline joins the sentence delimiters.
        List<String> whitespaceDelims = new ArrayList<>(Arrays.asList(DocumentPreprocessor.DEFAULT_SENTENCE_DELIMS));
        whitespaceDelims.add(WhitespaceLexer.NEWLINE);
        sentenceDelims = whitespaceDelims.toArray(new String[0]);
    } else {
        tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    }
    // Positional arguments are the input files; no files means read stdin once.
    String fileList = options.getProperty("", null);
    String[] files = fileList == null ? new String[1] : fileList.split("\\s+");
    int numSents = 0;
    // try-with-resources guarantees the writer is closed even if processing throws.
    try (PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, encoding), true)) {
        for (String file : files) {
            DocumentPreprocessor docPreprocessor;
            if (file == null || file.isEmpty()) {
                docPreprocessor = new DocumentPreprocessor(new InputStreamReader(System.in, encoding));
            } else {
                docPreprocessor = new DocumentPreprocessor(file, docType, encoding);
            }
            if (docType == DocType.XML) {
                docPreprocessor.setElementDelimiter(xmlElementDelimiter);
            }
            docPreprocessor.setTokenizerFactory(tf);
            if (sentenceDelimiter != null) {
                docPreprocessor.setSentenceDelimiter(sentenceDelimiter);
            }
            if (tagDelimiter != null) {
                docPreprocessor.setTagDelimiter(tagDelimiter);
            }
            if (sentenceDelims != null) {
                docPreprocessor.setSentenceFinalPuncWords(sentenceDelims);
            }
            for (List<HasWord> sentence : docPreprocessor) {
                numSents++;
                if (printSentenceLengths) {
                    System.err.printf("Length: %d%n", sentence.size());
                }
                boolean printSpace = false;
                for (HasWord word : sentence) {
                    if (printOriginalText) {
                        // Invertible tokenization: reproduce the exact original text.
                        CoreLabel cl = (CoreLabel) word;
                        if (!printSpace) {
                            pw.print(cl.get(CoreAnnotations.BeforeAnnotation.class));
                            printSpace = true;
                        }
                        pw.print(cl.get(CoreAnnotations.OriginalTextAnnotation.class));
                        pw.print(cl.get(CoreAnnotations.AfterAnnotation.class));
                    } else {
                        // Default: space-separate tokens within a sentence.
                        if (printSpace)
                            pw.print(" ");
                        printSpace = true;
                        pw.print(word.word());
                    }
                }
                pw.println();
            }
        }
    }
    System.err.printf("Read in %d sentences.%n", numSents);
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) InputStreamReader(java.io.InputStreamReader) Properties(java.util.Properties) CoreLabel(edu.stanford.nlp.ling.CoreLabel) ArrayList(java.util.ArrayList) List(java.util.List) OutputStreamWriter(java.io.OutputStreamWriter) PrintWriter(java.io.PrintWriter)

Example 14 with HasWord

Use of edu.stanford.nlp.ling.HasWord in the CoreNLP project by stanfordnlp.

From the class PTBEscapingProcessor, method main.

/**
 * This will do the escaping on an input file. Input file should already be tokenized,
 * with tokens separated by whitespace. <br>
 * Usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl
 *
 * @param args Command line argument: a file or URL
 */
public static void main(String[] args) {
    if (args.length != 1) {
        System.out.println("usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl");
        // A usage error is a failure: exit nonzero rather than returning success.
        System.exit(1);
    }
    String filename = args[0];
    try {
        // initialized below
        Document<String, Word, Word> d;
        // Accept both http and https URLs; anything else is treated as a file path.
        if (filename.startsWith("http://") || filename.startsWith("https://")) {
            Document<String, Word, Word> dpre = new BasicDocument<String>(WhitespaceTokenizer.factory()).init(new URL(filename));
            // Strip markup tags from web documents before escaping.
            DocumentProcessor<Word, Word, String, Word> notags = new StripTagsProcessor<>();
            d = notags.processDocument(dpre);
        } else {
            d = new BasicDocument<String>(WhitespaceTokenizer.factory()).init(new File(filename));
        }
        DocumentProcessor<Word, HasWord, String, Word> proc = new PTBEscapingProcessor<>();
        Document<String, Word, HasWord> newD = proc.processDocument(d);
        // Print each escaped token on its own line.
        for (HasWord word : newD) {
            System.out.println(word);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) Word(edu.stanford.nlp.ling.Word) HasWord(edu.stanford.nlp.ling.HasWord) BasicDocument(edu.stanford.nlp.ling.BasicDocument) URL(java.net.URL) File(java.io.File)

Example 15 with HasWord

Use of edu.stanford.nlp.ling.HasWord in the CoreNLP project by stanfordnlp.

From the class SemanticHeadFinder, method isVerbalAuxiliary.

/**
 * Decides whether the given preterminal node is a verbal auxiliary of the
 * desired kind. The POS tag is read from the node's label when it carries
 * one, falling back to the node's value; the word form is read from the
 * child's label, falling back to that label's value.
 *
 * @param preterminal       candidate preterminal node
 * @param verbalSet         lowercase auxiliary word forms to match against
 * @param allowJustTagMatch whether an unambiguous auxiliary tag alone is enough
 * @return {@code true} if the node matches by tag alone (when allowed) or by
 *         both tag and word; {@code false} otherwise (including non-preterminals)
 */
private boolean isVerbalAuxiliary(Tree preterminal, Set<String> verbalSet, boolean allowJustTagMatch) {
    // Guard clause: only preterminal nodes can be auxiliaries.
    if (!preterminal.isPreTerminal()) {
        return false;
    }
    Label nodeLabel = preterminal.label();
    String posTag = (nodeLabel instanceof HasTag) ? ((HasTag) nodeLabel).tag() : null;
    if (posTag == null) {
        posTag = preterminal.value();
    }
    Label childLabel = preterminal.firstChild().label();
    String wordForm = (childLabel instanceof HasWord) ? ((HasWord) childLabel).word() : null;
    if (wordForm == null) {
        wordForm = childLabel.value();
    }
    if (DEBUG) {
        log.info("Checking " + preterminal.value() + " head is " + wordForm + '/' + posTag);
    }
    String lowerCased = wordForm.toLowerCase();
    // Either the tag alone is decisive (when permitted), or both tag and word must match.
    boolean tagAloneMatches = allowJustTagMatch && unambiguousAuxiliaryTags.contains(posTag);
    boolean tagAndWordMatch = verbalTags.contains(posTag) && verbalSet.contains(lowerCased);
    if (tagAloneMatches || tagAndWordMatch) {
        if (DEBUG) {
            log.info("isAuxiliary found desired type of aux");
        }
        return true;
    }
    return false;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) Label(edu.stanford.nlp.ling.Label) HasTag(edu.stanford.nlp.ling.HasTag)

Aggregations

HasWord (edu.stanford.nlp.ling.HasWord)57 CoreLabel (edu.stanford.nlp.ling.CoreLabel)17 TaggedWord (edu.stanford.nlp.ling.TaggedWord)15 ArrayList (java.util.ArrayList)14 HasTag (edu.stanford.nlp.ling.HasTag)13 Tree (edu.stanford.nlp.trees.Tree)13 DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor)11 StringReader (java.io.StringReader)11 Label (edu.stanford.nlp.ling.Label)10 Word (edu.stanford.nlp.ling.Word)10 List (java.util.List)8 BufferedReader (java.io.BufferedReader)6 MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger)5 File (java.io.File)5 PrintWriter (java.io.PrintWriter)5 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)4 Pair (edu.stanford.nlp.util.Pair)4 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 HasIndex (edu.stanford.nlp.ling.HasIndex)3 Sentence (edu.stanford.nlp.ling.Sentence)3