Use of edu.stanford.nlp.process.CoreLabelTokenFactory in the Anserini project (castorini): class PyseriniEntryPoint, method getRankedPassages.
/**
 * Retrieves the top-ranked passages (sentences) for a query.
 * <p>
 * Searches the index for {@code numHits} documents, splits each hit into
 * sentences, tokenizes them with a PTB tokenizer, scores the candidate
 * sentences with an IDF-based passage scorer, and returns the top passages
 * as tab-separated {@code "sentence\tscore"} strings.
 *
 * @param query   the raw (untokenized) query string
 * @param numHits number of documents to retrieve from the index
 * @param k       parameter forwarded to {@link IdfPassageScorer}
 * @return top passages, each formatted as {@code sentence + "\t" + score}
 * @throws Exception if searching, sentence extraction, or scoring fails
 */
public List<String> getRankedPassages(String query, int numHits, int k) throws Exception {
    Map<String, Float> docScore = search(query, numHits);
    // LinkedHashMap preserves retrieval order of the candidate sentences.
    Map<String, Float> sentencesMap = new LinkedHashMap<>();
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    for (Map.Entry<String, Float> doc : docScore.entrySet()) {
        List<Sentence> sentences = indexUtils.getSentDocument(doc.getKey());
        for (Sentence thisSent : sentences) {
            // Tokenize the sentence and rejoin the tokens with single spaces,
            // so candidates are keyed by their normalized token string.
            List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(thisSent.text())).tokenize();
            String answerTokens = tokens.stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
            sentencesMap.put(answerTokens, doc.getValue());
        }
    }
    passageScorer = new IdfPassageScorer(indexDir, k);
    String queryTokens = tokenizerFactory.getTokenizer(new StringReader(query)).tokenize().stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
    // Bug fix: score against the tokenized query. Previously the raw query was
    // passed and queryTokens was computed but never used — inconsistent with
    // RetrieveSentences.getRankedPassages, which scores the tokenized query.
    passageScorer.score(queryTokens, sentencesMap);
    List<String> topSentences = new ArrayList<>();
    List<ScoredPassage> topPassages = passageScorer.extractTopPassages();
    for (ScoredPassage s : topPassages) {
        topSentences.add(s.getSentence() + "\t" + s.getScore());
    }
    return topSentences;
}
Use of edu.stanford.nlp.process.CoreLabelTokenFactory in the Anserini project (castorini): class RetrieveSentences, method getRankedPassages.
/**
 * Scores the query in {@code args} against sentences drawn from the retrieved
 * documents and prints the top passages (sentence and score) to stdout.
 *
 * @param args parsed command-line arguments carrying the query, hit count,
 *             and index path
 * @throws Exception if retrieval, sentence extraction, or scoring fails
 */
public void getRankedPassages(Args args) throws Exception {
    Map<String, Float> rankedDocs = retrieveDocuments(args.query, args.hits);
    // Insertion order of candidates matters to the scorer, hence LinkedHashMap.
    Map<String, Float> candidateSentences = new LinkedHashMap<>();
    IndexUtils indexUtils = new IndexUtils(args.index);
    TokenizerFactory<CoreLabel> tokenizer = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    for (Map.Entry<String, Float> rankedDoc : rankedDocs.entrySet()) {
        for (Sentence sentence : indexUtils.getSentDocument(rankedDoc.getKey())) {
            // Normalize each sentence to a space-joined token string.
            String tokenized = tokenizer.getTokenizer(new StringReader(sentence.text()))
                .tokenize()
                .stream()
                .map(CoreLabel::toString)
                .collect(Collectors.joining(" "));
            candidateSentences.put(tokenized, rankedDoc.getValue());
        }
    }
    // The query is tokenized the same way as the candidate sentences.
    String tokenizedQuery = tokenizer.getTokenizer(new StringReader(args.query))
        .tokenize()
        .stream()
        .map(CoreLabel::toString)
        .collect(Collectors.joining(" "));
    scorer.score(tokenizedQuery, candidateSentences);
    for (ScoredPassage passage : scorer.extractTopPassages()) {
        System.out.println(passage.getSentence() + " " + passage.getScore());
    }
}
Use of edu.stanford.nlp.process.CoreLabelTokenFactory in the CoreNLP project (stanfordnlp): class CoNLLReadingITest, method loadConllFileOriginal.
/**
 * Reads a CoNLL-X formatted file into parallel lists of sentences and
 * dependency trees.
 * <p>
 * A sentence boundary is any line with fewer than 10 tab-separated fields
 * (typically a blank line). Tokens with an unparseable head index are
 * skipped silently.
 *
 * @param inFile    path (or URL/classpath resource) of the CoNLL file
 * @param sents     output list; one CoreMap per sentence with a TokensAnnotation
 * @param trees     output list; one DependencyTree per sentence, parallel to sents
 * @param unlabeled if true, dependency labels are replaced by Config.UNKNOWN
 * @param cPOS      if true, use the coarse POS column (3) instead of fine (4)
 */
public static void loadConllFileOriginal(String inFile, List<CoreMap> sents, List<DependencyTree> trees, boolean unlabeled, boolean cPOS) {
    CoreLabelTokenFactory tf = new CoreLabelTokenFactory(false);
    try (BufferedReader reader = IOUtils.readerFromString(inFile)) {
        List<CoreLabel> sentenceTokens = new ArrayList<>();
        DependencyTree tree = new DependencyTree();
        for (String line : IOUtils.getLineIterable(reader, false)) {
            String[] splits = line.split("\t");
            if (splits.length < 10) {
                // Separator line: flush the sentence accumulated so far.
                if (sentenceTokens.size() > 0) {
                    trees.add(tree);
                    // CoreLabel implements CoreMap, so it can hold sentence-level annotations.
                    CoreMap sentence = new CoreLabel();
                    sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
                    sents.add(sentence);
                    tree = new DependencyTree();
                    sentenceTokens = new ArrayList<>();
                }
            } else {
                String word = splits[1], pos = cPOS ? splits[3] : splits[4], depType = splits[7];
                int head = -1;
                try {
                    head = Integer.parseInt(splits[6]);
                } catch (NumberFormatException e) {
                    // Malformed head index: deliberately skip this token line.
                    continue;
                }
                CoreLabel token = tf.makeToken(word, 0, 0);
                token.setTag(pos);
                token.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, head);
                token.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, depType);
                sentenceTokens.add(token);
                if (!unlabeled)
                    tree.add(head, depType);
                else
                    tree.add(head, Config.UNKNOWN);
            }
        }
        // Bug fix: flush the final sentence when the file does not end with a
        // separator line; previously the trailing sentence was silently dropped.
        if (sentenceTokens.size() > 0) {
            trees.add(tree);
            CoreMap sentence = new CoreLabel();
            sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
            sents.add(sentence);
        }
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    }
}
Use of edu.stanford.nlp.process.CoreLabelTokenFactory in the CoreNLP project (stanfordnlp): class ParserDemo, method demoAPI.
/**
 * demoAPI demonstrates other ways of calling the parser: with already
 * tokenized text, or with raw text that needs to be tokenized as a single
 * sentence. Output is handled with a TreePrint object; the options used
 * when creating the TreePrint determine what is printed. Output can be
 * captured by passing a PrintWriter to TreePrint.printTree. This code is
 * for English.
 */
public static void demoAPI(LexicalizedParser lp) {
    // Option 1: parse a list of correctly tokenized words directly.
    String[] tokenizedWords = { "This", "is", "an", "easy", "sentence", "." };
    List<CoreLabel> labeledWords = SentenceUtils.toCoreLabelList(tokenizedWords);
    Tree parse = lp.apply(labeledWords);
    parse.pennPrint();
    System.out.println();

    // Option 2: load an explicit tokenizer and tokenize raw text first.
    String rawSentence = "This is another sentence.";
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokenized = tokenizerFactory
        .getTokenizer(new StringReader(rawSentence))
        .tokenize();
    parse = lp.apply(tokenized);

    // PennTreebankLanguagePack for English: derive typed dependencies.
    TreebankLanguagePack tlp = lp.treebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure grammaticalStructure = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> dependencies = grammaticalStructure.typedDependenciesCCprocessed();
    System.out.println(dependencies);
    System.out.println();

    // A TreePrint object can also print trees and dependencies.
    TreePrint treePrint = new TreePrint("penn,typedDependenciesCollapsed");
    treePrint.printTree(parse);
}
Use of edu.stanford.nlp.process.CoreLabelTokenFactory in the CoreNLP project (stanfordnlp): class TaggerDemo2, method main.
/**
 * Tags a file sentence-by-sentence with a Maxent POS tagger and prints the
 * tagged sentences to stdout, then demonstrates accessing words and tags by
 * printing the adjectives of a hard-coded sentence.
 * <p>
 * Usage: {@code java TaggerDemo2 modelFile fileToTag}
 *
 * @param args {@code args[0]} = tagger model file, {@code args[1]} = UTF-8 text file to tag
 * @throws Exception if the model cannot be loaded or I/O fails
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        log.info("usage: java TaggerDemo2 modelFile fileToTag");
        return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    // "untokenizable=noneKeep" keeps untokenizable characters without warnings.
    TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    // Bug fix: use try-with-resources so the reader and writer are closed even
    // when tagging throws; previously they leaked on every exception path.
    try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
         PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {
        DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
        documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
        for (List<HasWord> sentence : documentPreprocessor) {
            List<TaggedWord> tSentence = tagger.tagSentence(sentence);
            pw.println(SentenceUtils.listToString(tSentence, false));
        }
        // Print the adjectives in one more sentence. This shows how to get at
        // words and tags in a tagged sentence.
        List<HasWord> sent = SentenceUtils.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
        List<TaggedWord> taggedSent = tagger.tagSentence(sent);
        for (TaggedWord tw : taggedSent) {
            if (tw.tag().startsWith("JJ")) {
                pw.println(tw.word());
            }
        }
    }
}
Aggregations