use of com.graphaware.nlp.dsl.request.PipelineSpecification in project neo4j-nlp-stanfordnlp by graphaware.
the class TextProcessorTest method testPipelineWithCustomStopwordsDoNotAddNERDateToTheWord.
/**
 * Annotates a sentence about US coinage with a pipeline that has the "dependency" step and a
 * custom stopword list ("start, starts"), then checks two things: a tag with lemma "the" is
 * present, and no tag equal to {@code newTag("the", ["DATE"], [])} is present.
 */
@Test
public void testPipelineWithCustomStopwordsDoNotAddNERDateToTheWord() {
    // Configure and register the pipeline first.
    PipelineSpecification pipelineSpec = new PipelineSpecification("customx", StanfordTextProcessor.class.getName());
    pipelineSpec.addProcessingStep("dependency");
    pipelineSpec.setStopWords("start, starts");
    textProcessor.createPipeline(pipelineSpec);

    String input = "In addition to the dollar the coinage act officially established monetary units of mill or one-thousandth of a dollar (symbol ₥), cent or one-hundredth of a dollar (symbol ¢), dime or one-tenth of a dollar, and eagle or ten dollars, with prescribed weights and composition of gold, silver, or copper for each.";
    AnnotatedText annotated = textProcessor.annotateText(input, "en", pipelineSpec);

    TestAnnotatedText checker = new TestAnnotatedText(annotated);
    // "the" must be tagged ...
    checker.assertTagWithLemma("the");
    // ... but never as a tag carrying the "DATE" label.
    checker.assertNotTag(newTag("the", Collections.singletonList("DATE"), Collections.emptyList()));
}
use of com.graphaware.nlp.dsl.request.PipelineSpecification in project neo4j-nlp-stanfordnlp by graphaware.
the class TextProcessorTest method testAddPipelineTakesStopwordsIntoAccount.
/**
 * Verifies that the stopwords configured on a {@link PipelineSpecification} are taken into
 * account when annotating: none of the tags produced for the input text may have a lemma that
 * appears in the configured stopword list.
 */
@Test
public void testAddPipelineTakesStopwordsIntoAccount() {
    // The commas are stripped so the input is a plain space-separated sequence of words.
    String text = "det, vad, eller, sin, efter, i, varje, sådan, de, ditt, han, dessa, vi, med, då, den, mig, denna, ingen, under, henne, sådant, du, hade, vilken,".replace(",", "");
    PipelineSpecification specification = new PipelineSpecification("customsw", StanfordTextProcessor.class.getName());
    specification.getProcessingSteps().put("tokenize", true);
    // Single source of truth: the same string is handed to the pipeline and used for the check.
    String stopwords = "sådan,själv, dig, från, vilkas, dem, ett, varit, varför, att, era, som";
    specification.setStopWords(stopwords);
    AnnotatedText annotatedText = textProcessor.annotateText(text, "en", specification);
    // Split on commas together with surrounding whitespace. A plain split(",") would keep the
    // leading spaces (" dig", " från", ...), so contains() could never match those entries.
    List<String> blacklist = Arrays.asList(stopwords.split("\\s*,\\s*"));
    annotatedText.getTags().forEach(tag -> assertFalse(blacklist.contains(tag.getLemma())));
}
use of com.graphaware.nlp.dsl.request.PipelineSpecification in project neo4j-nlp-stanfordnlp by graphaware.
the class TextProcessorTest method testAnnotatedQuestionWithNoStopwords.
/**
 * Annotates the question "What is in front of the Notre Dame Main Building?" with a pipeline
 * using the "dependency" step and the stopwords "start, starts". Expects exactly one sentence
 * whose tag occurrence at position 5 (where "is" begins in the text) has the lemma "be".
 */
@Test
public void testAnnotatedQuestionWithNoStopwords() {
    String question = "What is in front of the Notre Dame Main Building?";

    PipelineSpecification pipelineSpec = new PipelineSpecification("question-no-sw", StanfordTextProcessor.class.getName());
    pipelineSpec.addProcessingStep("dependency");
    pipelineSpec.setStopWords("start, starts");
    textProcessor.createPipeline(pipelineSpec);

    AnnotatedText result = textProcessor.annotateText(question, "en", pipelineSpec);

    // The whole question must be recognised as a single sentence.
    assertEquals(1, result.getSentences().size());
    Sentence onlySentence = result.getSentences().get(0);
    assertEquals("be", onlySentence.getTagOccurrence(5).getLemma());
}
use of com.graphaware.nlp.dsl.request.PipelineSpecification in project neo4j-nlp-stanfordnlp by graphaware.
the class TextProcessorTest method testAnnotatedTextWithPosition.
/**
 * Annotates a multi-sentence paragraph with a pipeline that has the "truecase", "sentiment" and
 * "coref" steps, then checks the sentence count, the number of tags in the first sentence and
 * the lemmas returned by {@code getTagOccurrence} at a series of positions in the first two
 * sentences.
 */
@Test
public void testAnnotatedTextWithPosition() {
    PipelineSpecification pipelineSpec = new PipelineSpecification("positionTest", StanfordTextProcessor.class.getName());
    pipelineSpec.addProcessingStep("truecase");
    pipelineSpec.addProcessingStep("sentiment");
    pipelineSpec.addProcessingStep("coref");
    // pipelineSpec.addProcessingStep("relations");
    textProcessor.createPipeline(pipelineSpec);

    String paragraph = "On 8 May 2013, "
            + "one week before the Pakistani election, the third author, "
            + "in his keynote address at the Sentiment Analysis Symposium, "
            + "forecast the winner of the Pakistani election. The chart "
            + "in Figure 1 shows varying sentiment on the candidates for "
            + "prime minister of Pakistan in that election. The next day, "
            + "the BBC’s Owen Bennett Jones, reporting from Islamabad, wrote "
            + "an article titled “Pakistan Elections: Five Reasons Why the "
            + "Vote is Unpredictable,”1 in which he claimed that the election "
            + "was too close to call. It was not, and despite his being in Pakistan, "
            + "the outcome of the election was exactly as we predicted.";
    AnnotatedText annotated = textProcessor.annotateText(paragraph, "en", pipelineSpec);

    assertEquals(4, annotated.getSentences().size());

    // First sentence: tag count and lemma lookups by position.
    Sentence first = annotated.getSentences().get(0);
    assertEquals(15, first.getTags().size());
    assertNull(first.getTagOccurrence(0));
    assertEquals("8 May 2013", first.getTagOccurrence(3).getLemma());
    assertEquals("one week", first.getTagOccurrence(15).getLemma());
    assertEquals("before", first.getTagOccurrence(24).getLemma());
    assertEquals("third", first.getTagOccurrence(59).getLemma());
    assertEquals("sentiment", first.getTagOccurrence(103).getLemma());
    assertEquals("forecast", first.getTagOccurrence(133).getLemma());
    // Position 184 is looked up again below on the second sentence, where it resolves to "chart".
    assertNull(first.getTagOccurrence(184));
    System.out.println(" >>> n_phrases = " + first.getPhraseOccurrences().size());
    // Disabled phrase-level checks, kept for reference:
    // assertTrue(first.getPhraseOccurrence(99).contains(new Phrase("the Sentiment Analysis Symposium")));
    // assertTrue(first.getPhraseOccurrence(103).contains(new Phrase("Sentiment")));
    // assertTrue(first.getPhraseOccurrence(113).contains(new Phrase("Analysis")));
    // his(76)-> the third author(54)
    // assertTrue(first.getPhraseOccurrence(55).get(1).getContent().equalsIgnoreCase("the third author"));

    // Second sentence.
    Sentence second = annotated.getSentences().get(1);
    assertEquals("chart", second.getTagOccurrence(184).getLemma());
    assertEquals("figure", second.getTagOccurrence(193).getLemma());
}
use of com.graphaware.nlp.dsl.request.PipelineSpecification in project neo4j-nlp-stanfordnlp by graphaware.
the class TextProcessorTest method testIssueWithBe.
@Test
public void testIssueWithBe() {
PipelineSpecification specification = new PipelineSpecification("issue-be", StanfordTextProcessor.class.getName());
specification.addProcessingStep("dependency");
specification.setStopWords("start, starts");
textProcessor.createPipeline(specification);
String text = "Unlike the Spanish milled dollar the U.S. dollar is based upon a decimal system of values.";
AnnotatedText annotatedText = textProcessor.annotateText(text, "en", specification);
assertEquals(1, annotatedText.getSentences().size());
Sentence sentence = annotatedText.getSentences().get(0);
assertEquals("be", sentence.getTagOccurrence(49).getLemma());
}
Aggregations