Use of edu.stanford.nlp.process.CoreLabelTokenFactory in project varaha by thedatachef.
In the class StanfordTokenize, the method exec:
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0))
        return null;
    // Output bag
    DataBag bagOfTokens = bagFactory.newDefaultBag();
    StringReader textInput = new StringReader(input.get(0).toString());
    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(textInput, new CoreLabelTokenFactory(), "");
    while (ptbt.hasNext()) {
        CoreLabel label = ptbt.next();
        Tuple termText = tupleFactory.newTuple(label.toString());
        bagOfTokens.add(termText);
    }
    return bagOfTokens;
}
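As a rough illustration, the Pig UDF above could be exercised directly from Java. The driver class below is hypothetical (not part of varaha) and assumes Pig and the varaha jar are on the classpath:

import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

// Hypothetical driver; the import for StanfordTokenize is omitted since its
// package depends on the varaha version.
public class StanfordTokenizeDemo {
    public static void main(String[] args) throws Exception {
        StanfordTokenize udf = new StanfordTokenize();
        // Wrap the raw text in a single-field Pig tuple, as Pig would at runtime.
        Tuple input = TupleFactory.getInstance().newTuple("The cat sat on the mat.");
        DataBag tokens = udf.exec(input);
        // The bag holds one single-field tuple per PTB token.
        for (Tuple t : tokens) {
            System.out.println(t.get(0));
        }
    }
}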
Use of edu.stanford.nlp.process.CoreLabelTokenFactory in project cogcomp-nlp by CogComp.
In the class StanfordParseHandler, the method buildStanfordSentences:
static List<CoreMap> buildStanfordSentences(TextAnnotation ta) {
    View tokens = ta.getView(ViewNames.TOKENS);
    View sentences = ta.getView(ViewNames.SENTENCE);
    String rawText = ta.getText();
    List<CoreMap> stanfordSentences = new LinkedList<>();
    List<CoreLabel> stanfordTokens = new LinkedList<>();
    int tokIndex = 0;
    int sentIndex = 0;
    Constituent currentSentence = sentences.getConstituents().get(0);
    String sentText = rawText.substring(currentSentence.getStartCharOffset(), currentSentence.getEndCharOffset());
    CoreLabelTokenFactory tf = new CoreLabelTokenFactory();
    for (Constituent tok : tokens.getConstituents()) {
        if (tok.getStartSpan() >= currentSentence.getEndSpan()) {
            // This token starts past the current sentence: close the sentence out and advance.
            CoreMap stanfordSentence = buildStanfordSentence(currentSentence, sentText, sentIndex++, stanfordTokens);
            stanfordSentences.add(stanfordSentence);
            stanfordTokens = new LinkedList<>();
            currentSentence = sentences.getConstituents().get(sentIndex);
            sentText = rawText.substring(currentSentence.getStartCharOffset(), currentSentence.getEndCharOffset());
        }
        int tokStart = tok.getStartCharOffset();
        int tokLength = tok.getEndCharOffset() - tokStart;
        String form = rawText.substring(tokStart, tok.getEndCharOffset());
        CoreLabel stanfordTok = tf.makeToken(form, tokStart, tokLength);
        stanfordTok.setIndex(tokIndex++);
        stanfordTokens.add(stanfordTok);
    }
    // There should be one last sentence left to close out.
    CoreMap stanfordSentence = buildStanfordSentence(currentSentence, sentText, sentIndex, stanfordTokens);
    stanfordSentences.add(stanfordSentence);
    return stanfordSentences;
}
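For context, here is a sketch of how this converter might be driven. The demo class is hypothetical and would have to live in the same package as StanfordParseHandler, since buildStanfordSentences is package-private; it also assumes cogcomp's BasicTextAnnotationBuilder for constructing a TextAnnotation from pre-tokenized text:

import java.util.Arrays;
import java.util.List;
import edu.illinois.cs.cogcomp.annotation.BasicTextAnnotationBuilder;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.stanford.nlp.util.CoreMap;

class StanfordParseHandlerDemo {
    public static void main(String[] args) {
        // Two pre-tokenized sentences; the builder populates the TOKENS and SENTENCE views.
        List<String[]> sentences = Arrays.asList(
                new String[] { "The", "cat", "sleeps", "." },
                new String[] { "So", "does", "the", "dog", "." });
        TextAnnotation ta = BasicTextAnnotationBuilder.createTextAnnotationFromTokens(sentences);
        List<CoreMap> stanfordSentences = StanfordParseHandler.buildStanfordSentences(ta);
        System.out.println(stanfordSentences.size());  // expect 2
    }
}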
Use of edu.stanford.nlp.process.CoreLabelTokenFactory in project Anserini by castorini.
In the class PyseriniEntryPoint, the method getAllSentences:
public List<String> getAllSentences(String query, int numHits) throws Exception {
    Map<String, Float> docScore = search(query, numHits);
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<String> allSentences = new ArrayList<>();
    for (Map.Entry<String, Float> doc : docScore.entrySet()) {
        List<Sentence> sentences = indexUtils.getSentDocument(doc.getKey());
        for (Sentence thisSent : sentences) {
            List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(thisSent.text())).tokenize();
            String tokenizedAnswer = tokens.stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
            allSentences.add(tokenizedAnswer);
        }
    }
    return allSentences;
}
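A hypothetical calling sequence for the method above. The index path and query are made up, and the initializeWithIndex setup call is an assumption about how the entry point is wired to a Lucene index:

import java.util.List;

// Hypothetical driver; the import for PyseriniEntryPoint is omitted since its
// package depends on the Anserini version.
public class SentenceDump {
    public static void main(String[] args) throws Exception {
        PyseriniEntryPoint entryPoint = new PyseriniEntryPoint();
        // Assumed setup call and index path, for illustration only.
        entryPoint.initializeWithIndex("lucene-index.example");
        List<String> sentences = entryPoint.getAllSentences("black bear attacks", 10);
        // Each entry is one sentence, re-joined from its PTB tokens with single spaces.
        sentences.forEach(System.out::println);
    }
}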
Use of edu.stanford.nlp.process.CoreLabelTokenFactory in project Anserini by castorini.
In the class RetrieveSentences, the method getRankedPassagesList:
public List<String> getRankedPassagesList(String query, String index, int hits, int k) throws Exception {
    Map<String, Float> scoredDocs = retrieveDocuments(query, hits);
    Map<String, Float> sentencesMap = new LinkedHashMap<>();
    IndexUtils util = new IndexUtils(index);
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    for (Map.Entry<String, Float> doc : scoredDocs.entrySet()) {
        List<Sentence> sentences = util.getSentDocument(doc.getKey());
        for (Sentence sent : sentences) {
            List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(sent.text())).tokenize();
            String answerTokens = tokens.stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
            sentencesMap.put(answerTokens, doc.getValue());
        }
    }
    scorer = new IdfPassageScorer(index, k);
    String queryTokens = tokenizerFactory.getTokenizer(new StringReader(query)).tokenize().stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
    scorer.score(queryTokens, sentencesMap);
    List<String> topSentences = new ArrayList<>();
    List<ScoredPassage> topPassages = scorer.extractTopPassages();
    for (ScoredPassage s : topPassages) {
        topSentences.add(s.getSentence() + "\t" + s.getScore());
        System.out.println(s.getSentence() + " " + s.getScore());
    }
    return topSentences;
}
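Both Anserini usages lean on the same idiom: build one reusable PTB tokenizer factory, tokenize each string, and re-join the resulting CoreLabels with single spaces. Isolated as a self-contained sketch (the example text is arbitrary):

import java.io.StringReader;
import java.util.List;
import java.util.stream.Collectors;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class PtbJoinDemo {
    public static void main(String[] args) {
        // Build the factory once; it is cheap to reuse across many strings.
        TokenizerFactory<CoreLabel> tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        List<CoreLabel> tokens = tf.getTokenizer(new StringReader("Isn't tokenization fun?")).tokenize();
        // Re-join with single spaces, e.g. "Is n't tokenization fun ?"
        String joined = tokens.stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
        System.out.println(joined);
    }
}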
Use of edu.stanford.nlp.process.CoreLabelTokenFactory in project CoreNLP by stanfordnlp.
In the class TokenSequenceMatcherITest, the method testTokenSequenceMatcherConj2:
@Test
public void testTokenSequenceMatcherConj2() throws IOException {
    String content = "The cat is sleeping on the floor.";
    String greedyPattern = "(?: ([]* cat []*) & ([]* sleeping []*))";
    TokenizerFactory<CoreLabel> tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = tf.getTokenizer(new StringReader(content)).tokenize();
    TokenSequencePattern seqPattern = TokenSequencePattern.compile(greedyPattern);
    TokenSequenceMatcher matcher = seqPattern.getMatcher(tokens);
    boolean entireMatch = matcher.matches();
    assertTrue(entireMatch);
    boolean match = matcher.find();
    assertTrue(match);
    assertEquals("The cat is sleeping on the floor.", matcher.group());
    String reluctantPattern = "(?: ([]*? cat []*?) & ([]*? sleeping []*?))";
    TokenSequencePattern seqPattern2 = TokenSequencePattern.compile(reluctantPattern);
    TokenSequenceMatcher matcher2 = seqPattern2.getMatcher(tokens);
    match = matcher2.find();
    assertTrue(match);
    assertEquals("The cat is sleeping", matcher2.group());
}
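To isolate what the test exercises, here is a minimal sketch of the same TokensRegex API with a single, simpler pattern. The []* quantifier is greedy and absorbs as many leading tokens as possible, which is exactly the contrast with the reluctant []*? form in the second half of the test:

import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;

public class TokensRegexDemo {
    public static void main(String[] args) {
        List<CoreLabel> tokens = PTBTokenizer.factory(new CoreLabelTokenFactory(), "")
                .getTokenizer(new StringReader("The cat is sleeping on the floor.")).tokenize();
        TokenSequencePattern pattern = TokenSequencePattern.compile("[]* sleeping");
        TokenSequenceMatcher matcher = pattern.getMatcher(tokens);
        if (matcher.find()) {
            // Greedy []* runs up to the literal token: "The cat is sleeping"
            System.out.println(matcher.group());
        }
    }
}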