Search in sources :

Example 6 with CoreLabelTokenFactory

use of edu.stanford.nlp.process.CoreLabelTokenFactory in project varaha by thedatachef.

From the class StanfordTokenize, the method exec:

/**
 * Tokenizes the text held in the first field of the input tuple using
 * Stanford's PTBTokenizer and returns a bag of single-field tuples, one
 * per token (the token's string form).
 *
 * @param input tuple whose first field is the text to tokenize; may be null
 * @return a bag of one-field tuples, one per token, or {@code null} when the
 *         input tuple is null, empty, or its first field is null
 * @throws IOException if a field of the input tuple cannot be read
 */
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0))
        return null;
    // Output bag
    DataBag bagOfTokens = bagFactory.newDefaultBag();
    StringReader textInput = new StringReader(input.get(0).toString());
    // Parameterizing the tokenizer with CoreLabel removes the raw type and
    // the explicit cast the original loop needed.
    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(textInput, new CoreLabelTokenFactory(), "");
    while (ptbt.hasNext()) {
        CoreLabel label = ptbt.next();
        Tuple termText = tupleFactory.newTuple(label.toString());
        bagOfTokens.add(termText);
    }
    return bagOfTokens;
}
Also used : PTBTokenizer(edu.stanford.nlp.process.PTBTokenizer) CoreLabel(edu.stanford.nlp.ling.CoreLabel) DataBag(org.apache.pig.data.DataBag) CoreLabelTokenFactory(edu.stanford.nlp.process.CoreLabelTokenFactory) StringReader(java.io.StringReader) Tuple(org.apache.pig.data.Tuple)

Example 7 with CoreLabelTokenFactory

use of edu.stanford.nlp.process.CoreLabelTokenFactory in project cogcomp-nlp by CogComp.

From the class StanfordParseHandler, the method buildStanfordSentences:

/**
 * Converts the TOKENS and SENTENCE views of a cogcomp {@code TextAnnotation}
 * into a list of Stanford {@code CoreMap} sentences, one per cogcomp sentence
 * constituent, each carrying Stanford {@code CoreLabel} tokens built from the
 * raw text's character offsets.
 *
 * @param ta annotation that must already contain TOKENS and SENTENCE views
 * @return one CoreMap per sentence, in document order
 */
static List<CoreMap> buildStanfordSentences(TextAnnotation ta) {
    View tokens = ta.getView(ViewNames.TOKENS);
    View sentences = ta.getView(ViewNames.SENTENCE);
    String rawText = ta.getText();
    List<CoreMap> stanfordSentences = new LinkedList<>();
    // Accumulates the tokens of the sentence currently being built; reset
    // each time a sentence boundary is crossed.
    List<CoreLabel> stanfordTokens = new LinkedList<>();
    // tokIndex is 0-based and runs across the whole document, not per
    // sentence. NOTE(review): Stanford tokens are conventionally indexed
    // from 1 within each sentence — confirm downstream consumers expect
    // this 0-based, document-wide numbering.
    int tokIndex = 0;
    int sentIndex = 0;
    // Assumes at least one sentence constituent exists; an empty SENTENCE
    // view would throw here. TODO confirm callers guarantee non-empty input.
    Constituent currentSentence = sentences.getConstituents().get(0);
    String sentText = rawText.substring(currentSentence.getStartCharOffset(), currentSentence.getEndCharOffset());
    CoreLabelTokenFactory tf = new CoreLabelTokenFactory();
    for (Constituent tok : tokens.getConstituents()) {
        // Token starts at or past the current sentence's end: flush the
        // finished sentence and advance to the next one before processing
        // this token.
        if (tok.getStartSpan() >= currentSentence.getEndSpan()) {
            CoreMap stanfordSentence = buildStanfordSentence(currentSentence, sentText, sentIndex++, stanfordTokens);
            stanfordSentences.add(stanfordSentence);
            stanfordTokens = new LinkedList<>();
            currentSentence = sentences.getConstituents().get(sentIndex);
            sentText = rawText.substring(currentSentence.getStartCharOffset(), currentSentence.getEndCharOffset());
        }
        // Build the Stanford token from raw-text character offsets so its
        // begin position and length line up with the original document.
        int tokStart = tok.getStartCharOffset();
        int tokLength = tok.getEndCharOffset() - tokStart;
        String form = rawText.substring(tokStart, tok.getEndCharOffset());
        CoreLabel stanfordTok = tf.makeToken(form, tokStart, tokLength);
        stanfordTok.setIndex(tokIndex++);
        stanfordTokens.add(stanfordTok);
    }
    // should be one last sentence
    CoreMap stanfordSentence = buildStanfordSentence(currentSentence, sentText, sentIndex, stanfordTokens);
    stanfordSentences.add(stanfordSentence);
    return stanfordSentences;
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreLabelTokenFactory(edu.stanford.nlp.process.CoreLabelTokenFactory) TreeView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TreeView) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) CoreMap(edu.stanford.nlp.util.CoreMap) ArrayCoreMap(edu.stanford.nlp.util.ArrayCoreMap) LinkedList(java.util.LinkedList) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 8 with CoreLabelTokenFactory

use of edu.stanford.nlp.process.CoreLabelTokenFactory in project Anserini by castorini.

From the class PyseriniEntryPoint, the method getAllSentences:

/**
 * Retrieves the top documents for a query, splits each into sentences, and
 * returns every sentence re-tokenized with Stanford's PTB tokenizer (tokens
 * joined by single spaces).
 *
 * @param query   the search query used to rank documents
 * @param numHits number of top documents to retrieve
 * @return all sentences from the retrieved documents, tokenized and
 *         space-joined, in document-then-sentence order
 * @throws Exception if search or document/sentence retrieval fails
 */
public List<String> getAllSentences(String query, int numHits) throws Exception {
    Map<String, Float> docScore = search(query, numHits);
    // Note: the original declared an unused LinkedHashMap here; removed.
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<String> allSentences = new ArrayList<>();
    for (Map.Entry<String, Float> doc : docScore.entrySet()) {
        List<Sentence> sentences = indexUtils.getSentDocument(doc.getKey());
        for (Sentence thisSent : sentences) {
            // Re-tokenize the sentence text and rejoin with spaces so the
            // output uses PTB token boundaries.
            List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(thisSent.text())).tokenize();
            String tokenizedAnswer = tokens.stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
            allSentences.add(tokenizedAnswer);
        }
    }
    return allSentences;
}
Also used : CoreLabelTokenFactory(edu.stanford.nlp.process.CoreLabelTokenFactory) CoreLabel(edu.stanford.nlp.ling.CoreLabel) StringReader(java.io.StringReader) Sentence(edu.stanford.nlp.simple.Sentence)

Example 9 with CoreLabelTokenFactory

use of edu.stanford.nlp.process.CoreLabelTokenFactory in project Anserini by castorini.

From the class RetrieveSentences, the method getRankedPassagesList:

/**
 * Retrieves documents for a query, tokenizes each document sentence with
 * Stanford's PTB tokenizer, scores the sentences against the tokenized query
 * with an IDF passage scorer, and returns the top passages formatted as
 * "sentence&lt;TAB&gt;score".
 *
 * Side effects: assigns this object's {@code scorer} field and prints each
 * top passage to stdout.
 *
 * @param query the search query
 * @param index path/name of the index to read sentences from
 * @param hits  number of documents to retrieve
 * @param k     scorer parameter passed to IdfPassageScorer
 * @return top passages, each as sentence + "\t" + score
 * @throws Exception if retrieval, tokenization, or scoring fails
 */
public List<String> getRankedPassagesList(String query, String index, int hits, int k) throws Exception {
    Map<String, Float> scoredDocs = retrieveDocuments(query, hits);
    // Maps tokenized sentence -> its document's retrieval score; insertion
    // order (LinkedHashMap) preserves document ranking order.
    Map<String, Float> sentencesMap = new LinkedHashMap<>();
    IndexUtils util = new IndexUtils(index);
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    for (Map.Entry<String, Float> doc : scoredDocs.entrySet()) {
        List<Sentence> sentences = util.getSentDocument(doc.getKey());
        for (Sentence sent : sentences) {
            List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(sent.text())).tokenize();
            String answerTokens = tokens.stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
            // NOTE(review): duplicate sentences across documents overwrite
            // each other's scores here — confirm that is intended.
            sentencesMap.put(answerTokens, doc.getValue());
        }
    }
    // Mutates instance state: later calls on this object see this scorer.
    scorer = new IdfPassageScorer(index, k);
    // The query is tokenized the same way as the sentences so the scorer
    // compares like with like.
    String queryTokens = tokenizerFactory.getTokenizer(new StringReader(query)).tokenize().stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
    scorer.score(queryTokens, sentencesMap);
    List<String> topSentences = new ArrayList<>();
    List<ScoredPassage> topPassages = scorer.extractTopPassages();
    for (ScoredPassage s : topPassages) {
        topSentences.add(s.getSentence() + "\t" + s.getScore());
        // NOTE(review): debug print to stdout — consider a logger instead.
        System.out.println(s.getSentence() + " " + s.getScore());
    }
    return topSentences;
}
Also used : CoreLabelTokenFactory(edu.stanford.nlp.process.CoreLabelTokenFactory) IdfPassageScorer(io.anserini.qa.passage.IdfPassageScorer) IndexUtils(io.anserini.index.IndexUtils) CoreLabel(edu.stanford.nlp.ling.CoreLabel) StringReader(java.io.StringReader) ScoredPassage(io.anserini.qa.passage.ScoredPassage) Sentence(edu.stanford.nlp.simple.Sentence)

Example 10 with CoreLabelTokenFactory

use of edu.stanford.nlp.process.CoreLabelTokenFactory in project CoreNLP by stanfordnlp.

From the class TokenSequenceMatcherITest, the method testTokenSequenceMatcherConj2:

@Test
public void testTokenSequenceMatcherConj2() throws IOException {
    String content = "The cat is sleeping on the floor.";
    // Conjunction pattern: the match must contain both "cat" and "sleeping".
    String greedyPattern = "(?: ([]* cat []*) & ([]* sleeping []*))";
    // Parameterized factory: the original raw TokenizerFactory produced an
    // unchecked assignment into List<CoreLabel>.
    TokenizerFactory<CoreLabel> tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = tf.getTokenizer(new StringReader(content)).tokenize();
    TokenSequencePattern seqPattern = TokenSequencePattern.compile(greedyPattern);
    TokenSequenceMatcher matcher = seqPattern.getMatcher(tokens);
    // matches() requires the whole token sequence to satisfy the pattern.
    boolean entireMatch = matcher.matches();
    assertTrue(entireMatch);
    boolean match = matcher.find();
    assertTrue(match);
    // Greedy []* expands to cover the entire sentence.
    assertEquals("The cat is sleeping on the floor.", matcher.group());
    // Reluctant []*? stops as early as both conjuncts are satisfied.
    String reluctantPattern = "(?: ([]*? cat []*?) & ([]*? sleeping []*?))";
    TokenSequencePattern seqPattern2 = TokenSequencePattern.compile(reluctantPattern);
    TokenSequenceMatcher matcher2 = seqPattern2.getMatcher(tokens);
    match = matcher2.find();
    assertTrue(match);
    assertEquals("The cat is sleeping", matcher2.group());
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) TokenizerFactory(edu.stanford.nlp.process.TokenizerFactory) CoreLabelTokenFactory(edu.stanford.nlp.process.CoreLabelTokenFactory) StringReader(java.io.StringReader) Test(org.junit.Test)

Aggregations

CoreLabelTokenFactory (edu.stanford.nlp.process.CoreLabelTokenFactory)12 CoreLabel (edu.stanford.nlp.ling.CoreLabel)11 StringReader (java.io.StringReader)7 Sentence (edu.stanford.nlp.simple.Sentence)4 CoreMap (edu.stanford.nlp.util.CoreMap)3 ScoredPassage (io.anserini.qa.passage.ScoredPassage)3 RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException)2 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)2 IndexUtils (io.anserini.index.IndexUtils)2 IdfPassageScorer (io.anserini.qa.passage.IdfPassageScorer)2 BufferedReader (java.io.BufferedReader)2 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)1 TreeView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TreeView)1 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)1 HasWord (edu.stanford.nlp.ling.HasWord)1 TaggedWord (edu.stanford.nlp.ling.TaggedWord)1 CoNLLUReader (edu.stanford.nlp.pipeline.CoNLLUReader)1 CoreMapAttributeAggregator (edu.stanford.nlp.pipeline.CoreMapAttributeAggregator)1 DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor)1 PTBTokenizer (edu.stanford.nlp.process.PTBTokenizer)1