use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
the class MaxentTagger method runTagger.
/**
* This method runs the tagger on the provided reader and writer.
*
* It takes input from the given {@code reader}, applies the
* tagger to it one sentence at a time (determined using
* documentPreprocessor), and writes the output to the given
* {@code writer}.
*
* The document is broken into sentences using the sentence
* processor determined in the tagger's TaggerConfig.
*
* {@code tagInside} makes the tagger run in XML mode. If set
* to non-empty, instead of processing the document as one large
* text blob, it considers each region in between the given tag to
* be a separate text blob.
*/
public void runTagger(BufferedReader reader, BufferedWriter writer, String tagInside, OutputStyle outputStyle) throws IOException {
  String sentenceDelimiter = config.getSentenceDelimiter();
  if (sentenceDelimiter != null && sentenceDelimiter.equals("newline")) {
    sentenceDelimiter = "\n";
  }
  final TokenizerFactory<? extends HasWord> tokenizerFactory = chooseTokenizerFactory();
  // Now we do everything through the doc preprocessor
  final DocumentPreprocessor docProcessor;
  if (tagInside.length() > 0) {
    // XML mode: each region between the given tags is treated as a separate text blob
    docProcessor = new DocumentPreprocessor(reader, DocumentPreprocessor.DocType.XML);
    docProcessor.setElementDelimiter(tagInside);
  } else {
    docProcessor = new DocumentPreprocessor(reader);
    docProcessor.setSentenceDelimiter(sentenceDelimiter);
  }
  if (config.keepEmptySentences()) {
    docProcessor.setKeepEmptySentences(true);
  }
  docProcessor.setTokenizerFactory(tokenizerFactory);
  runTagger(docProcessor, writer, outputStyle);
}
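For context, a minimal call site for runTagger might look like the sketch below. The model path and input file name are placeholders, and it assumes OutputStyle is the enum from edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter:
MaxentTagger tagger = new MaxentTagger("models/english-left3words-distsim.tagger");  // placeholder path
try (BufferedReader in = new BufferedReader(
         new InputStreamReader(new FileInputStream("input.txt"), StandardCharsets.UTF_8));
     BufferedWriter out = new BufferedWriter(
         new OutputStreamWriter(System.out, StandardCharsets.UTF_8))) {
  // An empty tagInside string keeps plain-text mode; e.g. "p" would tag only
  // the text inside <p>...</p> regions of an XML document.
  tagger.runTagger(in, out, "", OutputStyle.SLASH_TAGS);
}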
use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
the class TaggerDemo2 method main.
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    log.info("usage: java TaggerDemo2 modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
  BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
  documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
  for (List<HasWord> sentence : documentPreprocessor) {
    List<TaggedWord> tSentence = tagger.tagSentence(sentence);
    pw.println(SentenceUtils.listToString(tSentence, false));
  }
  // Print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
  List<HasWord> sent = SentenceUtils.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
  List<TaggedWord> taggedSent = tagger.tagSentence(sent);
  for (TaggedWord tw : taggedSent) {
    if (tw.tag().startsWith("JJ")) {
      pw.println(tw.word());
    }
  }
  pw.close();
}
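If the input is already in memory as a string, the reader/writer plumbing above can be skipped entirely; a shorter sketch using MaxentTagger's tagString, which tokenizes internally and returns the word_tag pairs as a single string (model path again a placeholder):
MaxentTagger tagger = new MaxentTagger("models/english-left3words-distsim.tagger");  // placeholder path
String tagged = tagger.tagString("The slimy slug crawled over the long, green grass.");
System.out.println(tagged);  // e.g. "The_DT slimy_JJ slug_NN crawled_VBD ..."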
use of edu.stanford.nlp.process.DocumentPreprocessor in project textdb by TextDB.
the class NlpSplitOperator method computeSentenceList.
private List<Span> computeSentenceList(Tuple inputTuple) {
  String inputText = inputTuple.<IField>getField(predicate.getInputAttributeName()).getValue().toString();
  Reader reader = new StringReader(inputText);
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(reader);
  List<Span> sentenceList = new ArrayList<Span>();
  int start = 0;
  int end = 0;
  String key = PropertyNameConstants.NLP_SPLIT_KEY;
  String attributeName = predicate.getInputAttributeName();
  for (List<HasWord> sentence : documentPreprocessor) {
    String sentenceText = Sentence.listToString(sentence);
    // Make a span covering this sentence; assumes one separator character between sentences
    end = start + sentenceText.length();
    Span span = new Span(attributeName, start, end, key, sentenceText);
    sentenceList.add(span);
    start = end + 1;
  }
  return sentenceList;
}
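One caveat worth noting: the span offsets above are computed from the re-joined token string, which PTB-style normalization (quotes, brackets) can make differ from the original surface text, so they are approximate rather than exact character positions. A self-contained sketch of the same splitting idea, with the textdb-specific types omitted:
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.process.DocumentPreprocessor;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

public class SentenceSplitSketch {
  public static List<String> split(String text) {
    List<String> sentences = new ArrayList<>();
    // DocumentPreprocessor yields one tokenized List<HasWord> per detected sentence.
    for (List<HasWord> sentence : new DocumentPreprocessor(new StringReader(text))) {
      sentences.add(SentenceUtils.listToString(sentence));
    }
    return sentences;
  }
}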
use of edu.stanford.nlp.process.DocumentPreprocessor in project uuusa by aghie.
the class Processor method process.
public List<SentimentDependencyGraph> process(String text) {
  // HashMap<String, String> emoLookupTable = new HashMap<String,String>();
  // for (String emoticon : emoticons){
  //   System.out.println(emoticon);
  //   String emouuid = UUID.randomUUID().toString();
  //   text.replaceAll(emoticon, emouuid);
  //   emoLookupTable.put(emouuid, emoticon);
  // }
  List<SentimentDependencyGraph> sdgs = new ArrayList<SentimentDependencyGraph>();
  DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text.concat(" ")));
  dp.setTokenizerFactory(PTBTokenizer.factory(new WordTokenFactory(), "ptb3Escaping=false"));
  for (List<HasWord> sentence : dp) {
    List<String> words = sentence.stream().map(w -> w.toString()).collect(Collectors.toList());
    // System.out.println("text: " + text);
    List<String> tokens = this.tokenizer.tokenize(String.join(" ", words));
    // System.out.println("tokens: " + tokens);
    List<TaggedTokenInformation> ttis = this.tagger.tag(tokens);
    sdgs.add(this.parser.parse(ttis));
  }
  // this.parser.parse(ttis);
  return sdgs;
}
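The detail doing the work here is the tokenizer option string: ptb3Escaping=false keeps raw characters (no -LRB-/-RRB- style escapes for brackets), which matters because the tokens are re-joined and handed to an external tokenizer and tagger. A stripped-down sketch of just that configuration:
DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader("A (small) test."));
dp.setTokenizerFactory(PTBTokenizer.factory(new WordTokenFactory(), "ptb3Escaping=false"));
for (List<HasWord> sentence : dp) {
  System.out.println(sentence);  // prints "(" and ")" literally, not -LRB-/-RRB-
}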
use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
the class MaxentTagger method tokenizeText.
/**
* Reads data from r, tokenizes it with the given tokenizer, and
* returns a List of Lists of objects that extend HasWord, which can then be
* fed into tagSentence.
*
* @param r Reader where untokenized text is read
* @param tokenizerFactory Tokenizer. This can be {@code null} in which case
* the default English tokenizer (PTBTokenizerFactory) is used.
* @return List of tokenized sentences
*/
public static List<List<HasWord>> tokenizeText(Reader r, TokenizerFactory<? extends HasWord> tokenizerFactory) {
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
  if (tokenizerFactory != null) {
    documentPreprocessor.setTokenizerFactory(tokenizerFactory);
  }
  List<List<HasWord>> out = Generics.newArrayList();
  for (List<HasWord> item : documentPreprocessor) {
    out.add(item);
  }
  return out;
}
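A hypothetical call site, passing null for the factory so the default English PTBTokenizer is used (tagger loaded as in the earlier examples, model path a placeholder):
MaxentTagger tagger = new MaxentTagger("models/english-left3words-distsim.tagger");  // placeholder path
List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader("First sentence. And a second one."), null);
for (List<HasWord> sentence : sentences) {
  System.out.println(tagger.tagSentence(sentence));
}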