Search in sources :

Example 1 with Word

use of edu.stanford.nlp.ling.Word in project lucida by claritylab.

the class StanfordParser method getPCFGScore.

/**
 * Tokenizes and parses a sentence, then reports the PCFG score of the parse
 * so that callers can use it as a confidence measure.
 *
 * @param sentence the sentence to tokenize and parse
 * @return the PCFG score reported by the parser for this sentence
 * @throws RuntimeException if the parser has not been initialized
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence) {
    boolean initialized = tlp != null && parser != null;
    if (!initialized) {
        throw new RuntimeException("Parser has not been initialized");
    }

    log.debug("Parsing sentence");

    // tokenizing, parsing and reading the score all happen while holding the parser's monitor
    synchronized (parser) {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
        List<Word> tokens = tokenizer.tokenize();
        log.debug("Tokenization: " + tokens);
        parser.parse(new Sentence(tokens));
        return parser.getPCFGScore();
    }
}
Also used : Word(edu.stanford.nlp.ling.Word) StringReader(java.io.StringReader) Tokenizer(edu.stanford.nlp.process.Tokenizer) Sentence(edu.stanford.nlp.ling.Sentence)

Example 2 with Word

use of edu.stanford.nlp.ling.Word in project lucida by claritylab.

the class StanfordParser method parse.

/**
 * Parses a sentence and returns a string representation of the parse tree.
 * <p>
 * The tree is rendered with <code>toString()</code> and then cleaned up:
 * every occurrence of a space followed by <code>[</code>, one or more
 * non-whitespace characters and <code>]</code> is removed from the result.
 *
 * @param sentence a sentence
 * @return the best parse as a string, with the bracketed annotations removed
 * @throws RuntimeException if the parser has not been initialized
 */
@SuppressWarnings("unchecked")
public static String parse(String sentence) {
    if (tlp == null || parser == null) {
        throw new RuntimeException("Parser has not been initialized");
    }
    // parse the sentence to produce stanford Tree
    log.debug("Parsing sentence");
    Tree tree;
    // tokenizing, parsing and fetching the result all happen while holding the parser's monitor
    synchronized (parser) {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        tree = parser.getBestParse();
    }
    // NOTE(review): if getBestParse() can return null (e.g. after a failed parse),
    // the call below throws NullPointerException - confirm against the parser API.
    return tree.toString().replaceAll(" \\[[\\S]+\\]", "");
}
Also used : Word(edu.stanford.nlp.ling.Word) StringReader(java.io.StringReader) Tree(edu.stanford.nlp.trees.Tree) Tokenizer(edu.stanford.nlp.process.Tokenizer) Sentence(edu.stanford.nlp.ling.Sentence)

Example 3 with Word

use of edu.stanford.nlp.ling.Word in project lucida by claritylab.

the class StanfordPosTagger method createSentence.

/**
 * Wraps every token in a <code>Word</code> and assembles the words, in
 * their original order, into a <code>Sentence</code>.
 *
 * @param tokens the token strings
 * @return <code>Sentence</code> whose words are the given tokens
 */
@SuppressWarnings("unchecked")
private static Sentence createSentence(String[] tokens) {
    ArrayList<HasWord> words = new ArrayList<HasWord>();
    for (int i = 0; i < tokens.length; i++) {
        words.add(new Word(tokens[i]));
    }

    Sentence result = new Sentence();
    result.setWords(words);
    return result;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) Word(edu.stanford.nlp.ling.Word) ArrayList(java.util.ArrayList) Sentence(edu.stanford.nlp.ling.Sentence)

Example 4 with Word

use of edu.stanford.nlp.ling.Word in project CoreNLP by stanfordnlp.

the class WordToTaggedWordProcessor method main.

/**
 * This will print out some text, recognizing tags.  It can be used to
 * test tag breaking.  <br>  Usage: <code>
 * java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl
 * </code>
 * <p>
 * An argument starting with <code>http://</code> or <code>https://</code>
 * is loaded as a URL and has its tags stripped before processing; any other
 * argument is opened as a local file. Each resulting word is printed on its
 * own line, prefixed with its index.
 *
 * @param args Command line argument: a file or URL
 */
public static void main(String[] args) {
    if (args.length != 1) {
        System.out.println("usage: java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl");
        System.exit(0);
    }
    String filename = args[0];
    try {
        Document<HasWord, Word, Word> d;
        // both plain and TLS web addresses are URLs; previously https:// fell through to File
        if (filename.startsWith("http://") || filename.startsWith("https://")) {
            Document<HasWord, Word, Word> dpre = new BasicDocument<HasWord>().init(new URL(filename));
            // web pages carry markup, so strip the tags before tagging words
            DocumentProcessor<Word, Word, HasWord, Word> notags = new StripTagsProcessor<>();
            d = notags.processDocument(dpre);
        } else {
            d = new BasicDocument<HasWord>().init(new File(filename));
        }
        DocumentProcessor<Word, HasWord, HasWord, Word> proc = new WordToTaggedWordProcessor<>();
        Document<HasWord, Word, HasWord> sentd = proc.processDocument(d);
        int i = 0;
        for (HasWord w : sentd) {
            System.out.println(i + ": " + w);
            i++;
        }
    } catch (Exception e) {
        // command-line entry point: report the failure and fall off the end of main
        e.printStackTrace();
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) Word(edu.stanford.nlp.ling.Word) TaggedWord(edu.stanford.nlp.ling.TaggedWord) BasicDocument(edu.stanford.nlp.ling.BasicDocument) URL(java.net.URL) File(java.io.File)

Example 5 with Word

use of edu.stanford.nlp.ling.Word in project CoreNLP by stanfordnlp.

the class StripTagsProcessor method process.

/**
 * Returns a new list holding the words of <code>in</code> with markup tags
 * removed.
 * <p>
 * A word is treated as a tag when its text starts with <code>&lt;</code>
 * and ends with <code>&gt;</code>. Tags are never copied to the output.
 * When <code>markLineBreaks</code> is set, a tag whose name appears in
 * <code>blockTags</code> is replaced by a single newline word, unless the
 * previous item added to the output was already such a newline.
 *
 * @param in the words to filter
 * @return a new list with the non-tag words, plus any inserted newline words
 */
public List<Word> process(List<? extends Word> in) {
    List<Word> out = new ArrayList<>();
    // to prevent contiguous newlines
    boolean justInsertedNewline = false;
    for (Word w : in) {
        String ws = w.word();
        if (ws.startsWith("<") && ws.endsWith(">")) {
            if (markLineBreaks && !justInsertedNewline) {
                // finds start and end of tag name (ignores brackets and /)
                // e.g. <p>, <br/>, or </table>
                //       se   s e        s    e
                int tagStartIndex = 1;
                while (tagStartIndex < ws.length() && !Character.isLetter(ws.charAt(tagStartIndex))) {
                    tagStartIndex++;
                }
                if (tagStartIndex == ws.length()) {
                    // no tag text
                    continue;
                }
                int tagEndIndex = ws.length() - 1;
                while (tagEndIndex > tagStartIndex && !Character.isLetterOrDigit(ws.charAt(tagEndIndex))) {
                    tagEndIndex--;
                }
                // looks up tag name in list of known block-level tags;
                // lower-case with a fixed locale so names such as "DIV" or "LI"
                // still match under locales with special casing rules (e.g. Turkish)
                String tagName = ws.substring(tagStartIndex, tagEndIndex + 1).toLowerCase(java.util.Locale.ROOT);
                if (blockTags.contains(tagName)) {
                    // mark newline for block-level tags
                    out.add(new Word("\n"));
                    justInsertedNewline = true;
                }
            }
        } else {
            // normal word
            out.add(w);
            justInsertedNewline = false;
        }
    }
    return out;
}
Also used : Word(edu.stanford.nlp.ling.Word) ArrayList(java.util.ArrayList)

Aggregations

Word (edu.stanford.nlp.ling.Word): 40; HasWord (edu.stanford.nlp.ling.HasWord): 15; TaggedWord (edu.stanford.nlp.ling.TaggedWord): 10; CoreLabel (edu.stanford.nlp.ling.CoreLabel): 9; ArrayList (java.util.ArrayList): 9; Tree (edu.stanford.nlp.trees.Tree): 8; StringReader (java.io.StringReader): 7; IndexedWord (edu.stanford.nlp.ling.IndexedWord): 5; CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 4; Label (edu.stanford.nlp.ling.Label): 4; CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations): 3; CorefChain (edu.stanford.nlp.coref.data.CorefChain): 3; Span (edu.stanford.nlp.ie.machinereading.structure.Span): 3; RelationTriple (edu.stanford.nlp.ie.util.RelationTriple): 3; CategoryWordTag (edu.stanford.nlp.ling.CategoryWordTag): 3; CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation): 3; Sentence (edu.stanford.nlp.ling.Sentence): 3; File (java.io.File): 3; PrintWriter (java.io.PrintWriter): 3; Dictionaries (edu.stanford.nlp.coref.data.Dictionaries): 2