Search in sources :

Example 1 with Sentence

use of edu.stanford.nlp.simple.Sentence in project CoreNLP by stanfordnlp.

the class TSVSentenceIteratorITest method testFullTokens.

@Test
public void testFullTokens() {
    List<List<String>> entries = new ArrayList<>();
    entries.add(new ArrayList<String>() {

        {
            add("3424");
            add("d2-s1-a1");
            add("0");
            add("{Chess,is,not,a,predominantly,physical,sport,\"\",\"\",yet,neither,are,shooting,and,curling,-LRB-,which,\"\",\"\",in,fact,\"\",\"\",has,been,nicknamed,``,chess,on,ice,'',5,-RRB-,.}");
            add("{chess,be,not,a,predominantly,physical,sport,\"\",\"\",yet,neither,be,shooting,and,curling,-lrb-,which,\"\",\"\",in,fact,\"\",\"\",have,be,nickname,``,chess,on,ice,'',5,-rrb-,.}");
            add("{NN,VBZ,RB,DT,RB,JJ,NN,\"\",\"\",RB,DT,VBP,JJ,CC,NN,-LRB-,WDT,\"\",\"\",IN,NN,\"\",\"\",VBZ,VBN,VBN,``,NN,IN,NN,'',LS,-RRB-,.}");
            add("{O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,NUMBER,O,O}");
            add("{0,6,9,13,15,29,38,43,45,49,57,61,70,74,82,83,88,90,93,97,99,103,108,118,119,125,128,131,132,133,134}");
            add("{5,8,12,14,28,37,43,44,48,56,60,69,73,81,83,88,89,92,97,98,102,107,117,119,124,127,131,132,133,134,135}");
            //add("[{\"\"dependent\"\": 7, \"\"dep\"\": \"\"ROOT\"\", \"\"governorGloss\"\": \"\"ROOT\"\", \"\"governor\"\": 0, \"\"dependentGloss\"\": \"\"sport\"\"}, {\"\"dependent\"\": 1, \"\"dep\"\": \"\"nsubj\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"Chess\"\"}, {\"\"dependent\"\": 2, \"\"dep\"\": \"\"cop\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"is\"\"}, {\"\"dependent\"\": 3, \"\"dep\"\": \"\"neg\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"not\"\"}, {\"\"dependent\"\": 4, \"\"dep\"\": \"\"det\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"a\"\"}, {\"\"dependent\"\": 5, \"\"dep\"\": \"\"advmod\"\", \"\"governorGloss\"\": \"\"physical\"\", \"\"governor\"\": 6, \"\"dependentGloss\"\": \"\"predominantly\"\"}, {\"\"dependent\"\": 6, \"\"dep\"\": \"\"amod\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"physical\"\"}, {\"\"dependent\"\": 9, \"\"dep\"\": \"\"advmod\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"yet\"\"}, {\"\"dependent\"\": 10, \"\"dep\"\": \"\"nsubj\"\", \"\"governorGloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentGloss\"\": \"\"neither\"\"}, {\"\"dependent\"\": 11, \"\"dep\"\": \"\"cop\"\", \"\"governorGloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentGloss\"\": \"\"are\"\"}, {\"\"dependent\"\": 12, \"\"dep\"\": \"\"parataxis\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"shooting\"\"}, {\"\"dependent\"\": 13, \"\"dep\"\": \"\"cc\"\", \"\"governorGloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentGloss\"\": \"\"and\"\"}, {\"\"dependent\"\": 14, \"\"dep\"\": \"\"parataxis\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"curling\"\"}, 
{\"\"dependent\"\": 14, \"\"dep\"\": \"\"conj:and\"\", \"\"governorGloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentGloss\"\": \"\"curling\"\"}, {\"\"dependent\"\": 16, \"\"dep\"\": \"\"nsubjpass\"\", \"\"governorGloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentGloss\"\": \"\"which\"\"}, {\"\"dependent\"\": 18, \"\"dep\"\": \"\"case\"\", \"\"governorGloss\"\": \"\"fact\"\", \"\"governor\"\": 19, \"\"dependentGloss\"\": \"\"in\"\"}, {\"\"dependent\"\": 19, \"\"dep\"\": \"\"nmod:in\"\", \"\"governorGloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentGloss\"\": \"\"fact\"\"}, {\"\"dependent\"\": 21, \"\"dep\"\": \"\"aux\"\", \"\"governorGloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentGloss\"\": \"\"has\"\"}, {\"\"dependent\"\": 22, \"\"dep\"\": \"\"auxpass\"\", \"\"governorGloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentGloss\"\": \"\"been\"\"}, {\"\"dependent\"\": 23, \"\"dep\"\": \"\"dep\"\", \"\"governorGloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentGloss\"\": \"\"nicknamed\"\"}, {\"\"dependent\"\": 25, \"\"dep\"\": \"\"dobj\"\", \"\"governorGloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentGloss\"\": \"\"chess\"\"}, {\"\"dependent\"\": 26, \"\"dep\"\": \"\"case\"\", \"\"governorGloss\"\": \"\"ice\"\", \"\"governor\"\": 27, \"\"dependentGloss\"\": \"\"on\"\"}, {\"\"dependent\"\": 27, \"\"dep\"\": \"\"nmod:on\"\", \"\"governorGloss\"\": \"\"chess\"\", \"\"governor\"\": 25, \"\"dependentGloss\"\": \"\"ice\"\"}, {\"\"dependent\"\": 29, \"\"dep\"\": \"\"amod\"\", \"\"governorGloss\"\": \"\"chess\"\", \"\"governor\"\": 25, \"\"dependentGloss\"\": \"\"5\"\"}]");
            add("Chess is not a predominantly physical sport, yet neither are shooting and curling (which, in fact, has been nicknamed “chess on ice”5).");
        }
    });
    TSVSentenceIterator it = new TSVSentenceIterator(entries.iterator(), new ArrayList<SentenceField>() {

        {
            add(SentenceField.ID);
            add(SentenceField.DOC_ID);
            add(SentenceField.SENTENCE_INDEX);
            add(SentenceField.WORDS);
            add(SentenceField.LEMMAS);
            add(SentenceField.POS_TAGS);
            add(SentenceField.NER_TAGS);
            add(SentenceField.DOC_CHAR_BEGIN);
            add(SentenceField.DOC_CHAR_END);
            add(SentenceField.GLOSS);
        }
    });
    Sentence sentence = it.next();
    Assert.assertEquals("3424", sentence.sentenceid().orElse("-1"));
    Assert.assertEquals("d2-s1-a1", sentence.document.docid().orElse("???"));
    Assert.assertEquals(0, sentence.sentenceIndex());
    Assert.assertEquals("Chess is not a predominantly physical sport, yet neither are shooting and curling (which, in fact, has been nicknamed “chess on ice”5).", sentence.text());
    Assert.assertArrayEquals(new String[] { "Chess", "is", "not", "a", "predominantly", "physical", "sport", ",", "yet", "neither", "are", "shooting", "and", "curling", "-LRB-", "which", ",", "in", "fact", ",", "has", "been", "nicknamed", "``", "chess", "on", "ice", "''", "5", "-RRB-", "." }, sentence.words().toArray());
    Assert.assertArrayEquals(new String[] { "chess", "be", "not", "a", "predominantly", "physical", "sport", ",", "yet", "neither", "be", "shooting", "and", "curling", "-lrb-", "which", ",", "in", "fact", ",", "have", "be", "nickname", "``", "chess", "on", "ice", "''", "5", "-rrb-", "." }, sentence.lemmas().toArray());
    Assert.assertArrayEquals(new String[] { "NN", "VBZ", "RB", "DT", "RB", "JJ", "NN", ",", "RB", "DT", "VBP", "JJ", "CC", "NN", "-LRB-", "WDT", ",", "IN", "NN", ",", "VBZ", "VBN", "VBN", "``", "NN", "IN", "NN", "''", "LS", "-RRB-", "." }, sentence.posTags().toArray());
    Assert.assertArrayEquals(new String[] { "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "NUMBER", "O", "O" }, sentence.nerTags().toArray());
    Assert.assertArrayEquals(new Integer[] { 0, 6, 9, 13, 15, 29, 38, 43, 45, 49, 57, 61, 70, 74, 82, 83, 88, 90, 93, 97, 99, 103, 108, 118, 119, 125, 128, 131, 132, 133, 134 }, sentence.characterOffsetBegin().toArray());
    Assert.assertArrayEquals(new Integer[] { 5, 8, 12, 14, 28, 37, 43, 44, 48, 56, 60, 69, 73, 81, 83, 88, 89, 92, 97, 98, 102, 107, 117, 119, 124, 127, 131, 132, 133, 134, 135 }, sentence.characterOffsetEnd().toArray());
}
Also used : ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) SentenceField(edu.stanford.nlp.util.TSVSentenceIterator.SentenceField) Sentence(edu.stanford.nlp.simple.Sentence) Test(org.junit.Test)

Example 2 with Sentence

use of edu.stanford.nlp.simple.Sentence in project CoreNLP by stanfordnlp.

the class TSVSentenceIteratorITest method testParseTrees.

@Test
public void testParseTrees() {
    List<List<String>> entries = new ArrayList<>();
    entries.add(new ArrayList<String>() {

        {
            add("3424");
            add("d2-s1-a1");
            add("0");
            add("{Chess,is,not,a,predominantly,physical,sport,\"\",\"\",yet,neither,are,shooting,and,curling,-LRB-,which,\"\",\"\",in,fact,\"\",\"\",has,been,nicknamed,``,chess,on,ice,'',5,-RRB-,.}");
            add("{chess,be,not,a,predominantly,physical,sport,\"\",\"\",yet,neither,be,shooting,and,curling,-lrb-,which,\"\",\"\",in,fact,\"\",\"\",have,be,nickname,``,chess,on,ice,'',5,-rrb-,.}");
            add("{NN,VBZ,RB,DT,RB,JJ,NN,\"\",\"\",RB,DT,VBP,JJ,CC,NN,-LRB-,WDT,\"\",\"\",IN,NN,\"\",\"\",VBZ,VBN,VBN,``,NN,IN,NN,'',LS,-RRB-,.}");
            add("{O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,NUMBER,O,O}");
            add("{0,6,9,13,15,29,38,43,45,49,57,61,70,74,82,83,88,90,93,97,99,103,108,118,119,125,128,131,132,133,134}");
            add("{5,8,12,14,28,37,43,44,48,56,60,69,73,81,83,88,89,92,97,98,102,107,117,119,124,127,131,132,133,134,135}");
            add("[{\"\"dependent\"\": 7, \"\"dep\"\": \"\"ROOT\"\", \"\"governorGloss\"\": \"\"ROOT\"\", \"\"governor\"\": 0, \"\"dependentGloss\"\": \"\"sport\"\"}, {\"\"dependent\"\": 1, \"\"dep\"\": \"\"nsubj\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"Chess\"\"}, {\"\"dependent\"\": 2, \"\"dep\"\": \"\"cop\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"is\"\"}, {\"\"dependent\"\": 3, \"\"dep\"\": \"\"neg\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"not\"\"}, {\"\"dependent\"\": 4, \"\"dep\"\": \"\"det\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"a\"\"}, {\"\"dependent\"\": 5, \"\"dep\"\": \"\"advmod\"\", \"\"governorGloss\"\": \"\"physical\"\", \"\"governor\"\": 6, \"\"dependentGloss\"\": \"\"predominantly\"\"}, {\"\"dependent\"\": 6, \"\"dep\"\": \"\"amod\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"physical\"\"}, {\"\"dependent\"\": 9, \"\"dep\"\": \"\"advmod\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"yet\"\"}, {\"\"dependent\"\": 10, \"\"dep\"\": \"\"nsubj\"\", \"\"governorGloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentGloss\"\": \"\"neither\"\"}, {\"\"dependent\"\": 11, \"\"dep\"\": \"\"cop\"\", \"\"governorGloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentGloss\"\": \"\"are\"\"}, {\"\"dependent\"\": 12, \"\"dep\"\": \"\"parataxis\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"shooting\"\"}, {\"\"dependent\"\": 13, \"\"dep\"\": \"\"cc\"\", \"\"governorGloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentGloss\"\": \"\"and\"\"}, {\"\"dependent\"\": 14, \"\"dep\"\": \"\"parataxis\"\", \"\"governorGloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentGloss\"\": \"\"curling\"\"}, 
{\"\"dependent\"\": 14, \"\"dep\"\": \"\"conj:and\"\", \"\"governorGloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentGloss\"\": \"\"curling\"\"}, {\"\"dependent\"\": 16, \"\"dep\"\": \"\"nsubjpass\"\", \"\"governorGloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentGloss\"\": \"\"which\"\"}, {\"\"dependent\"\": 18, \"\"dep\"\": \"\"case\"\", \"\"governorGloss\"\": \"\"fact\"\", \"\"governor\"\": 19, \"\"dependentGloss\"\": \"\"in\"\"}, {\"\"dependent\"\": 19, \"\"dep\"\": \"\"nmod:in\"\", \"\"governorGloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentGloss\"\": \"\"fact\"\"}, {\"\"dependent\"\": 21, \"\"dep\"\": \"\"aux\"\", \"\"governorGloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentGloss\"\": \"\"has\"\"}, {\"\"dependent\"\": 22, \"\"dep\"\": \"\"auxpass\"\", \"\"governorGloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentGloss\"\": \"\"been\"\"}, {\"\"dependent\"\": 23, \"\"dep\"\": \"\"dep\"\", \"\"governorGloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentGloss\"\": \"\"nicknamed\"\"}, {\"\"dependent\"\": 25, \"\"dep\"\": \"\"dobj\"\", \"\"governorGloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentGloss\"\": \"\"chess\"\"}, {\"\"dependent\"\": 26, \"\"dep\"\": \"\"case\"\", \"\"governorGloss\"\": \"\"ice\"\", \"\"governor\"\": 27, \"\"dependentGloss\"\": \"\"on\"\"}, {\"\"dependent\"\": 27, \"\"dep\"\": \"\"nmod:on\"\", \"\"governorGloss\"\": \"\"chess\"\", \"\"governor\"\": 25, \"\"dependentGloss\"\": \"\"ice\"\"}, {\"\"dependent\"\": 29, \"\"dep\"\": \"\"amod\"\", \"\"governorGloss\"\": \"\"chess\"\", \"\"governor\"\": 25, \"\"dependentGloss\"\": \"\"5\"\"}]");
            add("Chess is not a predominantly physical sport, yet neither are shooting and curling (which, in fact, has been nicknamed “chess on ice”5).");
        }
    });
    TSVSentenceIterator it = new TSVSentenceIterator(entries.iterator(), new ArrayList<SentenceField>() {

        {
            add(SentenceField.ID);
            add(SentenceField.DOC_ID);
            add(SentenceField.SENTENCE_INDEX);
            add(SentenceField.WORDS);
            add(SentenceField.LEMMAS);
            add(SentenceField.POS_TAGS);
            add(SentenceField.NER_TAGS);
            add(SentenceField.DOC_CHAR_BEGIN);
            add(SentenceField.DOC_CHAR_END);
            add(SentenceField.DEPENDENCIES_BASIC);
            add(SentenceField.GLOSS);
        }
    });
    Sentence sentence = it.next();
    sentence.dependencyGraph();
    sentence.openieTriples();
}
Also used : ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) SentenceField(edu.stanford.nlp.util.TSVSentenceIterator.SentenceField) Sentence(edu.stanford.nlp.simple.Sentence) Test(org.junit.Test)

Example 3 with Sentence

use of edu.stanford.nlp.simple.Sentence in project Anserini by castorini.

the class PyseriniEntryPoint method getRankedPassages.

/**
 * Searches for {@code query}, tokenizes every sentence of the returned documents, scores
 * those sentences against the tokenized query and returns the top passages.
 *
 * @param query   the free-text query
 * @param numHits passed through to {@code search(query, numHits)}
 * @param k       passed to the {@code IdfPassageScorer} constructor
 * @return one entry per top passage, formatted as sentence, a tab, then its score
 * @throws Exception propagated from search, sentence lookup or scoring
 */
public List<String> getRankedPassages(String query, int numHits, int k) throws Exception {
    Map<String, Float> docScore = search(query, numHits);
    // Keyed by the space-joined token string; a sentence seen again overwrites the earlier score.
    Map<String, Float> sentencesMap = new LinkedHashMap<>();
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    for (Map.Entry<String, Float> doc : docScore.entrySet()) {
        List<Sentence> sentences = indexUtils.getSentDocument(doc.getKey());
        for (Sentence thisSent : sentences) {
            // tokenize the sentences
            List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(thisSent.text())).tokenize();
            String answerTokens = tokens.stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
            // Every sentence carries the value its source document had in docScore.
            sentencesMap.put(answerTokens, doc.getValue());
        }
    }
    passageScorer = new IdfPassageScorer(indexDir, k);
    String queryTokens = tokenizerFactory.getTokenizer(new StringReader(query)).tokenize().stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
    // Score with the tokenized query so it is in the same space-joined token form as the
    // sentence keys; the raw query was previously passed here and queryTokens went unused.
    passageScorer.score(queryTokens, sentencesMap);
    List<String> topSentences = new ArrayList<>();
    List<ScoredPassage> topPassages = passageScorer.extractTopPassages();
    for (ScoredPassage s : topPassages) {
        topSentences.add(s.getSentence() + "\t" + s.getScore());
    }
    return topSentences;
}
Also used : CoreLabelTokenFactory(edu.stanford.nlp.process.CoreLabelTokenFactory) IdfPassageScorer(io.anserini.qa.passage.IdfPassageScorer) CoreLabel(edu.stanford.nlp.ling.CoreLabel) StringReader(java.io.StringReader) ScoredPassage(io.anserini.qa.passage.ScoredPassage) Sentence(edu.stanford.nlp.simple.Sentence)

Example 4 with Sentence

use of edu.stanford.nlp.simple.Sentence in project Anserini by castorini.

the class RetrieveSentences method getRankedPassages.

/**
 * Retrieves documents for {@code args.query}, tokenizes each of their sentences, scores the
 * sentences against the tokenized query and prints every top passage to standard output as
 * sentence, a space, then its score.
 *
 * @param args supplies the query text, the hit count and the index location
 * @throws Exception propagated from retrieval, sentence lookup or scoring
 */
public void getRankedPassages(Args args) throws Exception {
    Map<String, Float> docScores = retrieveDocuments(args.query, args.hits);
    Map<String, Float> scoreBySentence = new LinkedHashMap<>();
    IndexUtils sentenceSource = new IndexUtils(args.index);
    TokenizerFactory<CoreLabel> ptbFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

    for (Map.Entry<String, Float> entry : docScores.entrySet()) {
        for (Sentence candidate : sentenceSource.getSentDocument(entry.getKey())) {
            StringReader sentenceReader = new StringReader(candidate.text());
            List<CoreLabel> labels = ptbFactory.getTokenizer(sentenceReader).tokenize();
            String joined = labels.stream()
                    .map(label -> label.toString())
                    .collect(Collectors.joining(" "));
            // Each sentence is stored with the value of the document it came from.
            scoreBySentence.put(joined, entry.getValue());
        }
    }

    // The query goes through the same tokenizer and joining as the sentences.
    StringReader queryReader = new StringReader(args.query);
    List<CoreLabel> queryLabels = ptbFactory.getTokenizer(queryReader).tokenize();
    String queryTokens = queryLabels.stream()
            .map(label -> label.toString())
            .collect(Collectors.joining(" "));
    scorer.score(queryTokens, scoreBySentence);

    for (ScoredPassage passage : scorer.extractTopPassages()) {
        System.out.println(passage.getSentence() + " " + passage.getScore());
    }
}
Also used : CoreLabelTokenFactory(edu.stanford.nlp.process.CoreLabelTokenFactory) IndexUtils(io.anserini.index.IndexUtils) CoreLabel(edu.stanford.nlp.ling.CoreLabel) StringReader(java.io.StringReader) ScoredPassage(io.anserini.qa.passage.ScoredPassage) Sentence(edu.stanford.nlp.simple.Sentence)

Example 5 with Sentence

use of edu.stanford.nlp.simple.Sentence in project CoreNLP by stanfordnlp.

the class KBPStatisticalExtractor method surfaceFeatures.

/**
 * Adds surface-level indicator features for {@code input} to {@code feats}: sentence
 * unigrams, lemma n-grams and NER/lemma bigrams over the span returned by
 * {@code spanBetweenMentions}, a span-length bucket, punctuation parity, NER tags found in
 * the span, the tokens adjacent to the subject and object spans, and a skip-word pattern.
 *
 * @param input          the relation candidate; its sentence, subjectSpan and objectSpan are read
 * @param simpleSentence not read by this method (hence the UnusedParameters suppression)
 * @param feats          the counter handed to every {@code indicator} call
 */
@SuppressWarnings("UnusedParameters")
private static void surfaceFeatures(KBPInput input, Sentence simpleSentence, ClassicCounter<String> feats) {
    // Three parallel views of the same span, one per token attribute.
    // NOTE(review): presumably the tokens between the subject and object mentions, with all
    // three lists the same length (they are indexed together below) — confirm in spanBetweenMentions.
    List<String> lemmaSpan = spanBetweenMentions(input, CoreLabel::lemma);
    List<String> nerSpan = spanBetweenMentions(input, CoreLabel::ner);
    List<String> posSpan = spanBetweenMentions(input, CoreLabel::tag);
    // Unigram features of the sentence
    List<CoreLabel> tokens = input.sentence.asCoreLabels(Sentence::lemmas, Sentence::nerTags);
    for (CoreLabel token : tokens) {
        indicator(feats, "sentence_unigram", token.lemma());
    }
    // Full lemma span ( -0.3 F1 )
    // if (lemmaSpan.size() <= 5) {
    // indicator(feats, "full_lemma_span", withMentionsPositioned(input, StringUtils.join(lemmaSpan, " ")));
    // }
    // Lemma n-grams
    // "_^_" and "_$_" act as start-of-span and end-of-span markers for the bigrams.
    String lastLemma = "_^_";
    for (String lemma : lemmaSpan) {
        indicator(feats, "lemma_bigram", withMentionsPositioned(input, lastLemma + " " + lemma));
        indicator(feats, "lemma_unigram", withMentionsPositioned(input, lemma));
        lastLemma = lemma;
    }
    indicator(feats, "lemma_bigram", withMentionsPositioned(input, lastLemma + " _$_"));
    // NER + lemma bi-grams
    // Fires for adjacent pairs where one token has a non-"O" NER tag and the other is an
    // "O" token with POS tag "IN"; the feature pairs the NER tag with the other token's lemma.
    for (int i = 0; i < lemmaSpan.size() - 1; ++i) {
        if (!"O".equals(nerSpan.get(i)) && "O".equals(nerSpan.get(i + 1)) && "IN".equals(posSpan.get(i + 1))) {
            indicator(feats, "ner/lemma_bigram", withMentionsPositioned(input, nerSpan.get(i) + " " + lemmaSpan.get(i + 1)));
        }
        if (!"O".equals(nerSpan.get(i + 1)) && "O".equals(nerSpan.get(i)) && "IN".equals(posSpan.get(i))) {
            indicator(feats, "ner/lemma_bigram", withMentionsPositioned(input, lemmaSpan.get(i) + " " + nerSpan.get(i + 1)));
        }
    }
    // Distance between mentions
    // NOTE(review): the initial ">10" label is what spans longer than 15 end up with, while
    // spans of 11-15 get "<=15". The labels are left untouched here because they are emitted
    // as feature values — confirm before renaming.
    String distanceBucket = ">10";
    if (lemmaSpan.size() == 0) {
        distanceBucket = "0";
    } else if (lemmaSpan.size() <= 3) {
        distanceBucket = "<=3";
    } else if (lemmaSpan.size() <= 5) {
        distanceBucket = "<=5";
    } else if (lemmaSpan.size() <= 10) {
        distanceBucket = "<=10";
    } else if (lemmaSpan.size() <= 15) {
        distanceBucket = "<=15";
    }
    indicator(feats, "distance_between_entities_bucket", distanceBucket);
    // Punctuation features
    int numCommasInSpan = 0;
    int numQuotesInSpan = 0;
    // Net count of opening minus closing parentheses in the span; may go negative.
    int parenParity = 0;
    for (String lemma : lemmaSpan) {
        if (lemma.equals(",")) {
            numCommasInSpan += 1;
        }
        if (lemma.equals("\"") || lemma.equals("``") || lemma.equals("''")) {
            numQuotesInSpan += 1;
        }
        if (lemma.equals("(") || lemma.equals("-LRB-")) {
            parenParity += 1;
        }
        if (lemma.equals(")") || lemma.equals("-RRB-")) {
            parenParity -= 1;
        }
    }
    indicator(feats, "comma_parity", numCommasInSpan % 2 == 0 ? "even" : "odd");
    indicator(feats, "quote_parity", numQuotesInSpan % 2 == 0 ? "even" : "odd");
    indicator(feats, "paren_parity", "" + parenParity);
    // Is broken by entity
    // Distinct non-"O" NER tags appearing inside the span.
    Set<String> intercedingNERTags = nerSpan.stream().filter(ner -> !ner.equals("O")).collect(Collectors.toSet());
    if (!intercedingNERTags.isEmpty()) {
        indicator(feats, "has_interceding_ner", "t");
    }
    for (String ner : intercedingNERTags) {
        indicator(feats, "interceding_ner", ner);
    }
    // Left and right context
    // NOTE(review): only Sentence::nerTags is requested here, yet lemma() is read from these
    // labels below — confirm that lemmas are populated by this call.
    List<CoreLabel> sentence = input.sentence.asCoreLabels(Sentence::nerTags);
    // "^" stands in for the left neighbour when a span starts at token 0, "$" for the right
    // neighbour when a span's end() equals the sentence length.
    if (input.subjectSpan.start() == 0) {
        indicator(feats, "subj_left", "^");
    } else {
        indicator(feats, "subj_left", sentence.get(input.subjectSpan.start() - 1).lemma());
    }
    if (input.subjectSpan.end() == sentence.size()) {
        indicator(feats, "subj_right", "$");
    } else {
        indicator(feats, "subj_right", sentence.get(input.subjectSpan.end()).lemma());
    }
    if (input.objectSpan.start() == 0) {
        indicator(feats, "obj_left", "^");
    } else {
        indicator(feats, "obj_left", sentence.get(input.objectSpan.start() - 1).lemma());
    }
    if (input.objectSpan.end() == sentence.size()) {
        indicator(feats, "obj_right", "$");
    } else {
        indicator(feats, "obj_right", sentence.get(input.objectSpan.end()).lemma());
    }
    // Skip-word patterns
    // Only when the span holds exactly one token and the subject comes before the object:
    // pairs the lemma left of the subject with that single span lemma.
    if (lemmaSpan.size() == 1 && input.subjectSpan.isBefore(input.objectSpan)) {
        String left = input.subjectSpan.start() == 0 ? "^" : sentence.get(input.subjectSpan.start() - 1).lemma();
        indicator(feats, "X<subj>Y<obj>", left + "_" + lemmaSpan.get(0));
    }
}
Also used : edu.stanford.nlp.optimization(edu.stanford.nlp.optimization) CoreLabel(edu.stanford.nlp.ling.CoreLabel) java.util(java.util) Counters(edu.stanford.nlp.stats.Counters) IOUtils(edu.stanford.nlp.io.IOUtils) DefaultPaths(edu.stanford.nlp.pipeline.DefaultPaths) edu.stanford.nlp.util(edu.stanford.nlp.util) Redwood(edu.stanford.nlp.util.logging.Redwood) Util(edu.stanford.nlp.util.logging.Redwood.Util) Datum(edu.stanford.nlp.ling.Datum) Function(java.util.function.Function) Collectors(java.util.stream.Collectors) Span(edu.stanford.nlp.ie.machinereading.structure.Span) Counter(edu.stanford.nlp.stats.Counter) java.io(java.io) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) edu.stanford.nlp.classify(edu.stanford.nlp.classify) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) Sentence(edu.stanford.nlp.simple.Sentence) RedwoodConfiguration(edu.stanford.nlp.util.logging.RedwoodConfiguration) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) RVFDatum(edu.stanford.nlp.ling.RVFDatum) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Sentence(edu.stanford.nlp.simple.Sentence)

Aggregations

Sentence (edu.stanford.nlp.simple.Sentence)34 Test (org.junit.Test)20 AnalyzeParagragh (main.Analyze.AnalyzeParagragh)16 TableTuple (main.database.TableTuple)12 CoreLabel (edu.stanford.nlp.ling.CoreLabel)7 List (java.util.List)6 CoreLabelTokenFactory (edu.stanford.nlp.process.CoreLabelTokenFactory)4 StringReader (java.io.StringReader)4 Span (edu.stanford.nlp.ie.machinereading.structure.Span)3 CoreMap (edu.stanford.nlp.util.CoreMap)3 SentenceField (edu.stanford.nlp.util.TSVSentenceIterator.SentenceField)3 ScoredPassage (io.anserini.qa.passage.ScoredPassage)3 ArrayList (java.util.ArrayList)3 IndexUtils (io.anserini.index.IndexUtils)2 IdfPassageScorer (io.anserini.qa.passage.IdfPassageScorer)2 InteractiveTableTuple (main.database.InteractiveTableTuple)2 ReasonPair (main.database.ReasonPair)2 edu.stanford.nlp.classify (edu.stanford.nlp.classify)1 NERTag (edu.stanford.nlp.ie.KBPRelationExtractor.NERTag)1 IOUtils (edu.stanford.nlp.io.IOUtils)1