Usage example of edu.stanford.nlp.pipeline.Annotation in the CoreNLP project by stanfordnlp:
class QuantifiableEntityExtractorITest, method createDocument.
/**
 * Builds an {@link Annotation} from the raw text and runs the shared
 * test pipeline over it.
 *
 * @param text the raw document text to annotate
 * @return the fully annotated document
 */
protected static Annotation createDocument(String text) {
    final Annotation document = new Annotation(text);
    pipeline.annotate(document);
    return document;
}
Usage example of edu.stanford.nlp.pipeline.Annotation in the CoreNLP project by stanfordnlp:
class SpanishTokenizerAnnotatorITest, method testSpanish.
@Test
public void testSpanish() {
    // Build a Spanish tokenize-only pipeline.
    Properties config = new Properties();
    config.setProperty("annotators", "tokenize");
    config.setProperty("tokenize.language", "es");
    StanfordCoreNLP tokenizerPipeline = new StanfordCoreNLP(config);

    Annotation annotation = new Annotation("Damelo");
    tokenizerPipeline.annotate(annotation);

    // Each produced token must match the next expected Spanish token, in order.
    Iterator<String> expected = spanishTokens.iterator();
    for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
        assertEquals("Bung token in new CoreLabel usage", expected.next(), token.get(CoreAnnotations.TextAnnotation.class));
    }
    // And there must be no expected tokens left over.
    assertFalse("Too few tokens in new CoreLabel usage", expected.hasNext());
}
Usage example of edu.stanford.nlp.pipeline.Annotation in the CoreNLP project by stanfordnlp:
class Util, method annotate.
/**
 * Runs the given pipeline over a single, already-tokenized sentence, annotating
 * it in place.
 *
 * <p>The sentence is wrapped in a one-sentence {@link Annotation} whose text is
 * the space-joined token list, and whose tokens/sentences annotations alias the
 * caller's objects — so the pipeline's annotators write their results directly
 * onto {@code sentence} and its tokens.
 *
 * @param sentence a sentence that already carries a TokensAnnotation; mutated in place
 * @param pipeline the pipeline to run over the sentence (must not require a
 *                 tokenizer, since tokens are supplied — TODO confirm with callers)
 */
public static void annotate(CoreMap sentence, AnnotationPipeline pipeline) {
// NOTE: joining with " " may not reproduce the original surface text exactly
// (whitespace is normalized), but the token objects themselves are reused below.
Annotation ann = new Annotation(StringUtils.join(sentence.get(CoreAnnotations.TokensAnnotation.class), " "));
ann.set(CoreAnnotations.TokensAnnotation.class, sentence.get(CoreAnnotations.TokensAnnotation.class));
ann.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
pipeline.annotate(ann);
}
Usage example of edu.stanford.nlp.pipeline.Annotation in the CoreNLP project by stanfordnlp:
class GetPatternsFromDataMultiClass, method runPOSNERParseOnTokens.
/**
 * Runs POS tagging, lemmatization and — depending on the flags in
 * {@code propsoriginal} — parsing, dependency parsing and NER over sentences
 * that are already tokenized, annotating each {@link DataInstance} in place.
 *
 * @param sents         map from sentence id to its (already tokenized) DataInstance
 * @param propsoriginal caller properties; read for pattern type, restriction
 *                      flags, POS model path and thread count
 * @return the same {@code sents} map, with annotations added to its sentences
 */
public static Map<String, DataInstance> runPOSNERParseOnTokens(Map<String, DataInstance> sents, Properties propsoriginal) {
    PatternFactory.PatternType type = PatternFactory.PatternType.valueOf(propsoriginal.getProperty(Flags.patternType));
    Properties props = new Properties();
    List<String> anns = new ArrayList<>();
    anns.add("pos");
    anns.add("lemma");
    boolean useTargetParserParentRestriction = Boolean.parseBoolean(propsoriginal.getProperty(Flags.useTargetParserParentRestriction));
    boolean useTargetNERRestriction = Boolean.parseBoolean(propsoriginal.getProperty(Flags.useTargetNERRestriction));
    // BUG FIX: read the POS model path from the caller's properties. The old
    // code read it from the freshly created local `props`, which is empty at
    // this point, so a custom POS model was silently ignored (always null).
    String posModelPath = propsoriginal.getProperty(Flags.posModelPath);
    String numThreads = propsoriginal.getProperty(Flags.numThreads);
    if (useTargetParserParentRestriction) {
        anns.add("parse");
    } else if (type.equals(PatternFactory.PatternType.DEP)) {
        // NOTE(review): depparse is only added when parse is NOT requested;
        // presumably the constituency parse subsumes it — confirm intent.
        anns.add("depparse");
    }
    if (useTargetNERRestriction) {
        anns.add("ner");
    }
    props.setProperty("annotators", StringUtils.join(anns, ","));
    props.setProperty("parse.maxlen", "80");
    // Guard against a missing thread count: Properties.setProperty throws
    // NullPointerException on a null value. This matches the null check the
    // sibling tokenize() method already performs.
    if (numThreads != null) {
        props.setProperty("nthreads", numThreads);
        props.setProperty("threads", numThreads);
    }
    if (posModelPath != null) {
        props.setProperty("pos.model", posModelPath);
    }
    // enforceRequirements=false: tokenize/ssplit are intentionally absent
    // because the sentences are pre-tokenized below.
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props, false);
    Redwood.log(Redwood.DBG, "Annotating text");
    for (Map.Entry<String, DataInstance> en : sents.entrySet()) {
        List<CoreMap> temp = new ArrayList<>();
        CoreMap s = new ArrayCoreMap();
        s.set(CoreAnnotations.TokensAnnotation.class, en.getValue().getTokens());
        temp.add(s);
        Annotation doc = new Annotation(temp);
        try {
            pipeline.annotate(doc);
            if (useTargetParserParentRestriction) {
                inferParentParseTag(s.get(TreeAnnotation.class));
            }
        } catch (Exception e) {
            // Best-effort: a failed sentence is logged and skipped rather than
            // aborting the whole batch.
            log.warn("Ignoring error: for sentence " + StringUtils.joinWords(en.getValue().getTokens(), " "));
            log.warn(e);
        }
    }
    Redwood.log(Redwood.DBG, "Done annotating text");
    return sents;
}
Usage example of edu.stanford.nlp.pipeline.Annotation in the CoreNLP project by stanfordnlp:
class GetPatternsFromDataMultiClass, method tokenize.
/**
 * Reads up to one batch of raw lines from {@code textReader}, annotates them
 * with the (lazily created, shared) CoreNLP pipeline, and stores the resulting
 * sentences into {@code sents} keyed by {@code sentIDPrefix + index}.
 *
 * <p>When {@code batchProcessSents} is true, at most
 * {@code numMaxSentencesPerBatchFile} lines are consumed per call, and the
 * populated map is serialized into a numbered file under
 * {@code saveSentencesSerDirFile} and registered with {@code Data}.
 *
 * @param textReader   source of raw text lines; consumed incrementally across calls
 * @param posModelPath optional custom POS model path (may be null)
 * @param lowercase    whether to lowercase every line before annotation
 * @param numThreads   pipeline thread count as a string (may be null)
 * @param sents        output map; cleared after each serialized batch
 * @param numFilesTillNow running count of batch files written so far
 * @return the updated count of batch files written
 */
public static int tokenize(Iterator<String> textReader, String posModelPath, boolean lowercase, boolean useTargetNERRestriction, String sentIDPrefix, boolean useTargetParserParentRestriction, String numThreads, boolean batchProcessSents, int numMaxSentencesPerBatchFile, File saveSentencesSerDirFile, Map<String, DataInstance> sents, int numFilesTillNow, PatternFactory.PatternType type) throws InterruptedException, ExecutionException, IOException {
    // Lazily build the shared pipeline once; later calls reuse it (and thus
    // ignore any changed flag values — existing behavior, kept as-is).
    if (pipeline == null) {
        Properties props = new Properties();
        List<String> anns = new ArrayList<>();
        anns.add("tokenize");
        anns.add("ssplit");
        anns.add("pos");
        anns.add("lemma");
        if (useTargetParserParentRestriction) {
            anns.add("parse");
        }
        if (type.equals(PatternFactory.PatternType.DEP)) {
            anns.add("depparse");
        }
        if (useTargetNERRestriction) {
            anns.add("ner");
        }
        props.setProperty("annotators", StringUtils.join(anns, ","));
        props.setProperty("parse.maxlen", "80");
        if (numThreads != null) {
            props.setProperty("threads", numThreads);
        }
        props.setProperty("tokenize.options", "ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
        if (posModelPath != null) {
            props.setProperty("pos.model", posModelPath);
        }
        pipeline = new StanfordCoreNLP(props);
    }
    // Accumulate the batch text with a StringBuilder: the original `text +=`
    // concatenation in a loop is O(n^2) in the total text size.
    StringBuilder text = new StringBuilder();
    int numLines = 0;
    while (textReader.hasNext()) {
        String line = textReader.next();
        numLines++;
        if (batchProcessSents && numLines > numMaxSentencesPerBatchFile) {
            break;
        }
        if (lowercase) {
            line = line.toLowerCase();
        }
        text.append(line).append('\n');
    }
    Annotation doc = new Annotation(text.toString());
    pipeline.annotate(doc);
    int i = -1;
    for (CoreMap s : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
        i++;
        if (useTargetParserParentRestriction) {
            inferParentParseTag(s.get(TreeAnnotation.class));
        }
        DataInstance d = DataInstance.getNewInstance(type, s);
        sents.put(sentIDPrefix + i, d);
    }
    Redwood.log(Redwood.DBG, "Done annotating text with " + i + " sentences");
    if (sents.size() > 0 && batchProcessSents) {
        numFilesTillNow++;
        File file = new File(saveSentencesSerDirFile + "/sents_" + numFilesTillNow);
        IOUtils.writeObjectToFile(sents, file);
        Data.sentsFiles.add(file);
        for (String sentid : sents.keySet()) {
            assert !Data.sentId2File.containsKey(sentid) : "Data.sentId2File already contains " + sentid + ". Make sure sentIds are unique!";
            Data.sentId2File.put(sentid, file);
        }
        // The caller keeps its own reference to the map, so it is cleared (not
        // replaced) after serialization. The original trailing `sents = null`
        // parameter reassignment had no effect outside this method and was removed.
        sents.clear();
    }
    return numFilesTillNow;
}
Aggregations