use of edu.stanford.nlp.ling.Word in project lucida by claritylab.
the class StanfordParser method getPCFGScore.
/**
 * Tokenizes and parses a sentence, then reports the parser's PCFG score,
 * which callers can use as a confidence measure for the parse.
 *
 * @param sentence the sentence to parse
 * @return the PCFG score the parser reports after parsing the sentence
 * @throws RuntimeException if the language pack or the parser has not been
 *         initialized
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence) {
    boolean initialized = tlp != null && parser != null;
    if (!initialized) {
        throw new RuntimeException("Parser has not been initialized");
    }

    log.debug("Parsing sentence");
    // Parsing and reading the score happen under the same lock on the parser,
    // so the score read back belongs to the parse that was just run.
    synchronized (parser) {
        Tokenizer sentenceTokenizer =
                tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
        List<Word> tokens = sentenceTokenizer.tokenize();
        log.debug("Tokenization: " + tokens);
        parser.parse(new Sentence(tokens));
        return parser.getPCFGScore();
    }
}
use of edu.stanford.nlp.ling.Word in project lucida by claritylab.
the class StanfordParser method parse.
/**
 * Tokenizes and parses a sentence and renders the best parse as a string.
 *
 * @param sentence the sentence to parse
 * @return the string form of the best parse tree, with every occurrence of
 *         a space followed by a bracketed, whitespace-free annotation
 *         removed
 * @throws RuntimeException if the language pack or the parser has not been
 *         initialized
 */
@SuppressWarnings("unchecked")
public static String parse(String sentence) {
    boolean initialized = tlp != null && parser != null;
    if (!initialized) {
        throw new RuntimeException("Parser has not been initialized");
    }

    log.debug("Parsing sentence");
    final Tree bestParse;
    // Parsing and fetching the best parse happen under the same lock on the
    // parser, so the tree read back belongs to the parse that was just run.
    synchronized (parser) {
        Tokenizer sentenceTokenizer =
                tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
        List<Word> tokens = sentenceTokenizer.tokenize();
        log.debug("Tokenization: " + tokens);
        parser.parse(new Sentence(tokens));
        bestParse = parser.getBestParse();
    }

    // Strip the " [...]" annotations from the printed tree.
    String rendered = bestParse.toString();
    return rendered.replaceAll(" \\[[\\S]+\\]", "");
}
use of edu.stanford.nlp.ling.Word in project lucida by claritylab.
the class StanfordPosTagger method createSentence.
/**
 * Builds a <code>Sentence</code> whose words are the given tokens, in order.
 *
 * @param tokens the token strings; each one is wrapped in a <code>Word</code>
 * @return a new <code>Sentence</code> containing one word per token
 */
@SuppressWarnings("unchecked")
private static Sentence createSentence(String[] tokens) {
    ArrayList<HasWord> words = new ArrayList<HasWord>();
    for (int i = 0; i < tokens.length; i++) {
        words.add(new Word(tokens[i]));
    }

    Sentence result = new Sentence();
    result.setWords(words);
    return result;
}
use of edu.stanford.nlp.ling.Word in project CoreNLP by stanfordnlp.
the class WordToTaggedWordProcessor method main.
/**
 * Prints the words of a document, one per line and prefixed with a running
 * index, after passing them through a <code>WordToTaggedWordProcessor</code>.
 * It can be used to test tag breaking.
 * <p>
 * Usage: <code>
 * java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl
 * </code>
 * <p>
 * An argument starting with <code>http://</code> or <code>https://</code> is
 * loaded as a URL and run through a <code>StripTagsProcessor</code> first;
 * anything else is treated as a local file name.
 *
 * @param args Command line argument: a file or URL
 */
public static void main(String[] args) {
    if (args.length != 1) {
        System.out.println("usage: java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl");
        System.exit(0);
    }
    String filename = args[0];
    try {
        Document<HasWord, Word, Word> d;
        // Treat both plain and TLS web addresses as URLs; previously an
        // https:// argument fell through to the local-file branch and failed.
        boolean isUrl = filename.startsWith("http://") || filename.startsWith("https://");
        if (isUrl) {
            Document<HasWord, Word, Word> dpre = new BasicDocument<HasWord>().init(new URL(filename));
            // Web pages carry markup, so strip the tags before tagging words.
            DocumentProcessor<Word, Word, HasWord, Word> notags = new StripTagsProcessor<>();
            d = notags.processDocument(dpre);
        } else {
            d = new BasicDocument<HasWord>().init(new File(filename));
        }
        DocumentProcessor<Word, HasWord, HasWord, Word> proc = new WordToTaggedWordProcessor<>();
        Document<HasWord, Word, HasWord> sentd = proc.processDocument(d);
        int i = 0;
        for (HasWord w : sentd) {
            System.out.println(i + ": " + w);
            i++;
        }
    } catch (Exception e) {
        // Command-line test driver: report the failure and fall off the end.
        e.printStackTrace();
    }
}
use of edu.stanford.nlp.ling.Word in project CoreNLP by stanfordnlp.
the class StripTagsProcessor method process.
/**
 * Returns a new list holding the words of <tt>in</tt> with markup tags
 * stripped. A word counts as a tag when it starts with <code>&lt;</code> and
 * ends with <code>&gt;</code>.
 * <p>
 * When <code>markLineBreaks</code> is set, a tag whose name is listed in
 * <code>blockTags</code> is replaced by a single newline word; consecutive
 * block-level tags produce only one newline. All other tags are dropped.
 *
 * @param in the words to filter; the list itself is not modified
 * @return a new list with the non-tag words of <tt>in</tt>, in order, plus
 *         any newline markers
 */
public List<Word> process(List<? extends Word> in) {
    List<Word> out = new ArrayList<>();
    // to prevent contiguous newlines
    boolean justInsertedNewline = false;
    for (Word w : in) {
        String ws = w.word();
        boolean isTag = ws.startsWith("<") && ws.endsWith(">");
        if (!isTag) {
            // normal word
            out.add(w);
            justInsertedNewline = false;
            continue;
        }
        if (!markLineBreaks || justInsertedNewline) {
            // tag is simply dropped
            continue;
        }

        // Find the start of the tag name: skip the opening bracket and any
        // leading non-letters such as '/' in a closing tag.
        int tagStartIndex = 1;
        while (tagStartIndex < ws.length() && !Character.isLetter(ws.charAt(tagStartIndex))) {
            tagStartIndex++;
        }
        if (tagStartIndex == ws.length()) {
            // no tag text
            continue;
        }

        // Find the end of the tag name by scanning forward over name
        // characters. Scanning forward (rather than back from the closing
        // bracket) keeps attributes out of the name, so <p class="x"> is
        // recognised as "p".
        int tagEndIndex = tagStartIndex + 1;
        while (tagEndIndex < ws.length()) {
            char c = ws.charAt(tagEndIndex);
            boolean nameChar = Character.isLetterOrDigit(c) || c == '-' || c == '_' || c == ':';
            if (!nameChar) {
                break;
            }
            tagEndIndex++;
        }

        // Look up the tag name in the list of known block-level tags.
        // Locale.ROOT keeps the lower-casing locale-independent (e.g. "DIV"
        // must not become "dıv" under a Turkish default locale).
        String tagName = ws.substring(tagStartIndex, tagEndIndex).toLowerCase(java.util.Locale.ROOT);
        if (blockTags.contains(tagName)) {
            // mark newline for block-level tags
            out.add(new Word("\n"));
            justInsertedNewline = true;
        }
    }
    return out;
}
Aggregations