Search in sources :

Example 1 with CoreLabelTokenFactory

use of edu.stanford.nlp.process.CoreLabelTokenFactory in project Anserini by castorini.

In the class PyseriniEntryPoint, the method getRankedPassages:

/**
 * Retrieves the top-ranked passages for a query.
 * <p>
 * Searches the index for {@code numHits} documents, splits each hit into
 * sentences, tokenizes them, scores the sentences against the tokenized
 * query with an {@link IdfPassageScorer}, and returns the top passages as
 * tab-separated {@code "sentence\tscore"} strings.
 *
 * @param query   raw (untokenized) query string
 * @param numHits number of documents to retrieve from the index
 * @param k       parameter passed to the IdfPassageScorer constructor
 * @return top passages, each formatted as {@code sentence + "\t" + score}
 * @throws Exception if search, index access, or scoring fails
 */
public List<String> getRankedPassages(String query, int numHits, int k) throws Exception {
    Map<String, Float> docScore = search(query, numHits);
    // LinkedHashMap preserves insertion (retrieval) order of candidate sentences.
    Map<String, Float> sentencesMap = new LinkedHashMap<>();
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    for (Map.Entry<String, Float> doc : docScore.entrySet()) {
        List<Sentence> sentences = indexUtils.getSentDocument(doc.getKey());
        for (Sentence thisSent : sentences) {
            // Tokenize each sentence and rejoin with single spaces so candidate
            // sentences get the same token normalization as the query below.
            List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(thisSent.text())).tokenize();
            String answerTokens = tokens.stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
            sentencesMap.put(answerTokens, doc.getValue());
        }
    }
    passageScorer = new IdfPassageScorer(indexDir, k);
    String queryTokens = tokenizerFactory.getTokenizer(new StringReader(query)).tokenize().stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
    // BUG FIX: score with the tokenized query (queryTokens) rather than the raw
    // query string. Previously queryTokens was computed but unused, which was
    // inconsistent with RetrieveSentences.getRankedPassages, where the scorer
    // receives the tokenized form.
    passageScorer.score(queryTokens, sentencesMap);
    List<String> topSentences = new ArrayList<>();
    List<ScoredPassage> topPassages = passageScorer.extractTopPassages();
    for (ScoredPassage s : topPassages) {
        topSentences.add(s.getSentence() + "\t" + s.getScore());
    }
    return topSentences;
}
Also used : CoreLabelTokenFactory(edu.stanford.nlp.process.CoreLabelTokenFactory) IdfPassageScorer(io.anserini.qa.passage.IdfPassageScorer) CoreLabel(edu.stanford.nlp.ling.CoreLabel) StringReader(java.io.StringReader) ScoredPassage(io.anserini.qa.passage.ScoredPassage) Sentence(edu.stanford.nlp.simple.Sentence)

Example 2 with CoreLabelTokenFactory

use of edu.stanford.nlp.process.CoreLabelTokenFactory in project Anserini by castorini.

In the class RetrieveSentences, the method getRankedPassages:

/**
 * Scores sentences from the retrieved documents against the query and prints
 * the top passages (sentence and score, space-separated) to standard output.
 *
 * @param args command-line arguments carrying the query, hit count, and index path
 * @throws Exception if retrieval, index access, or scoring fails
 */
public void getRankedPassages(Args args) throws Exception {
    Map<String, Float> rankedDocs = retrieveDocuments(args.query, args.hits);
    // Insertion-ordered map: candidate sentence text -> parent document score.
    Map<String, Float> candidateSentences = new LinkedHashMap<>();
    IndexUtils indexUtils = new IndexUtils(args.index);
    TokenizerFactory<CoreLabel> factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    for (Map.Entry<String, Float> entry : rankedDocs.entrySet()) {
        for (Sentence sentence : indexUtils.getSentDocument(entry.getKey())) {
            // Normalize each sentence to space-joined PTB tokens.
            String joined = factory.getTokenizer(new StringReader(sentence.text()))
                    .tokenize()
                    .stream()
                    .map(CoreLabel::toString)
                    .collect(Collectors.joining(" "));
            candidateSentences.put(joined, entry.getValue());
        }
    }
    // Apply the same tokenization to the query before scoring.
    List<CoreLabel> queryLabels = factory.getTokenizer(new StringReader(args.query)).tokenize();
    String tokenizedQuery = queryLabels.stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
    scorer.score(tokenizedQuery, candidateSentences);
    for (ScoredPassage passage : scorer.extractTopPassages()) {
        System.out.println(passage.getSentence() + " " + passage.getScore());
    }
}
Also used : CoreLabelTokenFactory(edu.stanford.nlp.process.CoreLabelTokenFactory) IndexUtils(io.anserini.index.IndexUtils) CoreLabel(edu.stanford.nlp.ling.CoreLabel) StringReader(java.io.StringReader) ScoredPassage(io.anserini.qa.passage.ScoredPassage) Sentence(edu.stanford.nlp.simple.Sentence)

Example 3 with CoreLabelTokenFactory

use of edu.stanford.nlp.process.CoreLabelTokenFactory in project CoreNLP by stanfordnlp.

In the class CoNLLReadingITest, the method loadConllFileOriginal:

/**
 * Loads a CoNLL-format dependency file, appending one CoreMap per sentence to
 * {@code sents} and one DependencyTree per sentence to {@code trees}.
 * <p>
 * Lines with fewer than 10 tab-separated fields (e.g. blank lines) act as
 * sentence boundaries. NOTE(review): a trailing sentence is only flushed when
 * followed by such a boundary line — presumably input files always end with a
 * blank line; since this "Original" variant appears to replicate legacy reader
 * behavior for an integration test, that behavior is left as-is.
 *
 * @param inFile    path (or other IOUtils-resolvable spec) of the CoNLL file
 * @param sents     output list; one CoreMap with a TokensAnnotation per sentence
 * @param trees     output list; one DependencyTree per sentence
 * @param unlabeled if true, dependency labels are replaced with Config.UNKNOWN
 * @param cPOS      if true, use the coarse POS column (index 3) instead of the
 *                  fine-grained one (index 4)
 */
public static void loadConllFileOriginal(String inFile, List<CoreMap> sents, List<DependencyTree> trees, boolean unlabeled, boolean cPOS) {
    // Token factory without annotations of token offsets (offsets are faked as 0,0 below).
    CoreLabelTokenFactory tf = new CoreLabelTokenFactory(false);
    try (BufferedReader reader = IOUtils.readerFromString(inFile)) {
        List<CoreLabel> sentenceTokens = new ArrayList<>();
        DependencyTree tree = new DependencyTree();
        for (String line : IOUtils.getLineIterable(reader, false)) {
            String[] splits = line.split("\t");
            if (splits.length < 10) {
                // Sentence boundary: emit the accumulated sentence, if any.
                if (sentenceTokens.size() > 0) {
                    trees.add(tree);
                    CoreMap sentence = new CoreLabel();
                    sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
                    sents.add(sentence);
                    tree = new DependencyTree();
                    sentenceTokens = new ArrayList<>();
                }
            } else {
                // CoNLL columns: 1=word, 3=coarse POS, 4=fine POS, 6=head index, 7=dep label.
                String word = splits[1], pos = cPOS ? splits[3] : splits[4], depType = splits[7];
                int head = -1;
                try {
                    head = Integer.parseInt(splits[6]);
                } catch (NumberFormatException e) {
                    // Unparseable head index: skip this token line entirely.
                    continue;
                }
                CoreLabel token = tf.makeToken(word, 0, 0);
                token.setTag(pos);
                token.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, head);
                token.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, depType);
                sentenceTokens.add(token);
                if (!unlabeled)
                    tree.add(head, depType);
                else
                    tree.add(head, Config.UNKNOWN);
            }
        }
    } catch (IOException e) {
        // Wrap as unchecked so callers need not declare IOException.
        throw new RuntimeIOException(e);
    }
}
Also used : RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) CoreLabelTokenFactory(edu.stanford.nlp.process.CoreLabelTokenFactory) ArrayList(java.util.ArrayList) IOException(java.io.IOException) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) CoreLabel(edu.stanford.nlp.ling.CoreLabel) BufferedReader(java.io.BufferedReader) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 4 with CoreLabelTokenFactory

use of edu.stanford.nlp.process.CoreLabelTokenFactory in project CoreNLP by stanfordnlp.

In the class ParserDemo, the method demoAPI:

/**
 * Demonstrates alternative ways of invoking the parser: on already-tokenized
 * text, and on raw text tokenized here as a single sentence. Output goes to
 * standard output via pennPrint, typed-dependency printing, and a TreePrint
 * object (whose construction options control what gets printed). A
 * PrintWriter can be passed to TreePrint.printTree to capture output.
 * This code is for English.
 */
public static void demoAPI(LexicalizedParser lp) {
    // Case 1: parse a list of correctly pre-tokenized words.
    String[] tokenizedSentence = { "This", "is", "an", "easy", "sentence", "." };
    List<CoreLabel> labels = SentenceUtils.toCoreLabelList(tokenizedSentence);
    Tree parse = lp.apply(labels);
    parse.pennPrint();
    System.out.println();
    // Case 2: tokenize raw text with an explicit PTB tokenizer, then parse.
    String rawSentence = "This is another sentence.";
    TokenizerFactory<CoreLabel> factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tokenizer = factory.getTokenizer(new StringReader(rawSentence));
    List<CoreLabel> tokenized = tokenizer.tokenize();
    parse = lp.apply(tokenized);
    // For English this is a PennTreebankLanguagePack.
    TreebankLanguagePack languagePack = lp.treebankLanguagePack();
    GrammaticalStructureFactory structureFactory = languagePack.grammaticalStructureFactory();
    GrammaticalStructure structure = structureFactory.newGrammaticalStructure(parse);
    List<TypedDependency> dependencies = structure.typedDependenciesCCprocessed();
    System.out.println(dependencies);
    System.out.println();
    // A TreePrint can also render trees and dependencies; its option string
    // selects the output formats.
    TreePrint treePrint = new TreePrint("penn,typedDependenciesCollapsed");
    treePrint.printTree(parse);
}
Also used : CoreLabelTokenFactory(edu.stanford.nlp.process.CoreLabelTokenFactory) CoreLabel(edu.stanford.nlp.ling.CoreLabel) StringReader(java.io.StringReader)

Example 5 with CoreLabelTokenFactory

use of edu.stanford.nlp.process.CoreLabelTokenFactory in project CoreNLP by stanfordnlp.

In the class TaggerDemo2, the method main:

/**
 * Tags the sentences of a UTF-8 text file with a MaxentTagger loaded from a
 * model file, printing one tagged sentence per line, then prints the
 * JJ-tagged (adjective) words of one additional hard-coded sentence.
 *
 * @param args {@code args[0]} = tagger model file, {@code args[1]} = file to tag
 * @throws Exception if the model cannot be loaded or the input cannot be read
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        log.info("usage: java TaggerDemo2 modelFile fileToTag");
        return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    // FIX: try-with-resources — the BufferedReader was previously never closed
    // (resource leak), and the PrintWriter was not flushed/closed if tagging threw.
    try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
         PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {
        DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
        documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
        for (List<HasWord> sentence : documentPreprocessor) {
            List<TaggedWord> tSentence = tagger.tagSentence(sentence);
            pw.println(SentenceUtils.listToString(tSentence, false));
        }
        // Print the adjectives in one more sentence. This shows how to get at
        // words and tags in a tagged sentence.
        List<HasWord> sent = SentenceUtils.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
        List<TaggedWord> taggedSent = tagger.tagSentence(sent);
        for (TaggedWord tw : taggedSent) {
            if (tw.tag().startsWith("JJ")) {
                pw.println(tw.word());
            }
        }
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) CoreLabelTokenFactory(edu.stanford.nlp.process.CoreLabelTokenFactory) InputStreamReader(java.io.InputStreamReader) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) TaggedWord(edu.stanford.nlp.ling.TaggedWord) MaxentTagger(edu.stanford.nlp.tagger.maxent.MaxentTagger) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor) PrintWriter(java.io.PrintWriter)

Aggregations

CoreLabelTokenFactory (edu.stanford.nlp.process.CoreLabelTokenFactory)12 CoreLabel (edu.stanford.nlp.ling.CoreLabel)11 StringReader (java.io.StringReader)7 Sentence (edu.stanford.nlp.simple.Sentence)4 CoreMap (edu.stanford.nlp.util.CoreMap)3 ScoredPassage (io.anserini.qa.passage.ScoredPassage)3 RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException)2 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)2 IndexUtils (io.anserini.index.IndexUtils)2 IdfPassageScorer (io.anserini.qa.passage.IdfPassageScorer)2 BufferedReader (java.io.BufferedReader)2 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)1 TreeView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TreeView)1 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)1 HasWord (edu.stanford.nlp.ling.HasWord)1 TaggedWord (edu.stanford.nlp.ling.TaggedWord)1 CoNLLUReader (edu.stanford.nlp.pipeline.CoNLLUReader)1 CoreMapAttributeAggregator (edu.stanford.nlp.pipeline.CoreMapAttributeAggregator)1 DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor)1 PTBTokenizer (edu.stanford.nlp.process.PTBTokenizer)1