Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp:
the class WikidictAnnotator, method main.
/**
* A debugging method to try entity linking sentences from the console.
* @throws IOException
*/
public static void main(String[] args) throws IOException {
  Properties props = StringUtils.argsToProperties(args);
  props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,entitymentions,entitylink");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  IOUtils.console("sentence> ", line -> {
    Annotation ann = new Annotation(line);
    pipeline.annotate(ann);
    List<CoreLabel> tokens = ann.get(CoreAnnotations.SentencesAnnotation.class).get(0).get(CoreAnnotations.TokensAnnotation.class);
    System.err.println(StringUtils.join(tokens.stream().map(x -> x.get(CoreAnnotations.WikipediaEntityAnnotation.class)), " "));
  });
}
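For reference, a minimal non-interactive sketch of the same pipeline: it annotates a fixed sentence and prints each token next to its WikipediaEntityAnnotation value. The class name EntityLinkDemo and the sample sentence are illustrative, not part of CoreNLP, and the sketch assumes the entity-linking (Wikidict) model files are available on the classpath.

import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class EntityLinkDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    // Same annotator chain as the console loop above:
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,entitymentions,entitylink");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation ann = new Annotation("Barack Obama was born in Hawaii.");
    pipeline.annotate(ann);
    for (CoreMap sentence : ann.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        // Print each token with its linked Wikipedia entity, if any was found
        System.out.println(token.word() + "\t" + token.get(CoreAnnotations.WikipediaEntityAnnotation.class));
      }
    }
  }
}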
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp:
the class DocumentPreprocessor, method main.
/**
* A simple, deterministic sentence-splitter. This method only supports the English
* tokenizer, so for other languages you should run the tokenizer first and then
* run this sentence splitter with the "-whitespaceTokenization" option.
*
* @param args Command-line arguments
*/
public static void main(String[] args) throws IOException {
  final Properties options = StringUtils.argsToProperties(args, argOptionDefs());
  if (options.containsKey("help")) {
    log.info(usage());
    return;
  }
  // Command-line flags
  String encoding = options.getProperty("encoding", "utf-8");
  boolean printSentenceLengths = PropertiesUtils.getBool(options, "printSentenceLengths", false);
  String xmlElementDelimiter = options.getProperty("xml", null);
  DocType docType = xmlElementDelimiter == null ? DocType.Plain : DocType.XML;
  String sentenceDelimiter = options.containsKey("noTokenization") ? System.getProperty("line.separator") : null;
  String tagDelimiter = options.getProperty("tag", null);
  String[] sentenceDelims = null;
  // Set up the TokenizerFactory; at most one tokenizer flag may be given
  int numFactoryFlags = 0;
  boolean suppressEscaping = options.containsKey("suppressEscaping");
  if (suppressEscaping)
    numFactoryFlags += 1;
  boolean customTokenizer = options.containsKey("tokenizerOptions");
  if (customTokenizer)
    numFactoryFlags += 1;
  boolean printOriginalText = options.containsKey("printOriginalText");
  if (printOriginalText)
    numFactoryFlags += 1;
  boolean whitespaceTokenization = options.containsKey("whitespaceTokenization");
  if (whitespaceTokenization)
    numFactoryFlags += 1;
  if (numFactoryFlags > 1) {
    log.info("Only one tokenizer flag allowed at a time: ");
    log.info("  -suppressEscaping, -tokenizerOptions, -printOriginalText, -whitespaceTokenization");
    return;
  }
  TokenizerFactory<? extends HasWord> tf = null;
  if (suppressEscaping) {
    tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "ptb3Escaping=false");
  } else if (customTokenizer) {
    tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), options.getProperty("tokenizerOptions"));
  } else if (printOriginalText) {
    tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");
  } else if (whitespaceTokenization) {
    List<String> whitespaceDelims = new ArrayList<>(Arrays.asList(DocumentPreprocessor.DEFAULT_SENTENCE_DELIMS));
    whitespaceDelims.add(WhitespaceLexer.NEWLINE);
    sentenceDelims = whitespaceDelims.toArray(new String[whitespaceDelims.size()]);
  } else {
    tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  }
  String fileList = options.getProperty("", null);
  // With no file arguments, the single null entry makes the loop below read from stdin
  String[] files = fileList == null ? new String[1] : fileList.split("\\s+");
  int numSents = 0;
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);
  for (String file : files) {
    DocumentPreprocessor docPreprocessor;
    if (file == null || file.isEmpty()) {
      docPreprocessor = new DocumentPreprocessor(new InputStreamReader(System.in, encoding));
    } else {
      docPreprocessor = new DocumentPreprocessor(file, docType, encoding);
    }
    if (docType == DocType.XML) {
      docPreprocessor.setElementDelimiter(xmlElementDelimiter);
    }
    docPreprocessor.setTokenizerFactory(tf);
    if (sentenceDelimiter != null) {
      docPreprocessor.setSentenceDelimiter(sentenceDelimiter);
    }
    if (tagDelimiter != null) {
      docPreprocessor.setTagDelimiter(tagDelimiter);
    }
    if (sentenceDelims != null) {
      docPreprocessor.setSentenceFinalPuncWords(sentenceDelims);
    }
    for (List<HasWord> sentence : docPreprocessor) {
      numSents++;
      if (printSentenceLengths) {
        System.err.printf("Length: %d%n", sentence.size());
      }
      boolean printSpace = false;
      for (HasWord word : sentence) {
        if (printOriginalText) {
          CoreLabel cl = (CoreLabel) word;
          if (!printSpace) {
            pw.print(cl.get(CoreAnnotations.BeforeAnnotation.class));
            printSpace = true;
          }
          pw.print(cl.get(CoreAnnotations.OriginalTextAnnotation.class));
          pw.print(cl.get(CoreAnnotations.AfterAnnotation.class));
        } else {
          if (printSpace)
            pw.print(" ");
          printSpace = true;
          pw.print(word.word());
        }
      }
      pw.println();
    }
  }
  pw.close();
  System.err.printf("Read in %d sentences.%n", numSents);
}
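The command-line driver above wraps the same iteration that library code can use directly: DocumentPreprocessor implements Iterable<List<HasWord>>, so sentence splitting is a simple for-each loop. A minimal sketch follows (the class name SplitDemo and the sample text are illustrative); with no tokenizer factory set, it falls back to the default English PTB tokenizer, matching the final else branch above.

import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;

public class SplitDemo {
  public static void main(String[] args) {
    // DocumentPreprocessor tokenizes and splits lazily as we iterate
    DocumentPreprocessor dp =
        new DocumentPreprocessor(new StringReader("Hello world. This is a test."));
    for (List<HasWord> sentence : dp) {
      System.out.println(sentence.size() + " tokens: " + sentence);
    }
  }
}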
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp:
the class Sighan2005DocumentReaderAndWriter, method printLattice.
@Override
public void printLattice(DFSA<String, Integer> tagLattice, List<CoreLabel> doc, PrintWriter out) {
  CoreLabel[] docArray = doc.toArray(new CoreLabel[doc.size()]);
  // Create answer lattice:
  MutableInteger nodeId = new MutableInteger(0);
  DFSA<String, Integer> answerLattice = new DFSA<>(null);
  DFSAState<String, Integer> aInitState = new DFSAState<>(nodeId.intValue(), answerLattice);
  answerLattice.setInitialState(aInitState);
  Map<DFSAState<String, Integer>, DFSAState<String, Integer>> stateLinks = Generics.newHashMap();
  // Convert binary lattice into word lattice:
  tagLatticeToAnswerLattice(tagLattice.initialState(), aInitState, new StringBuilder(""), nodeId, 0, 0.0, stateLinks, answerLattice, docArray);
  try {
    answerLattice.printAttFsmFormat(out);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp:
the class Sighan2005DocumentReaderAndWriter, method tagLatticeToAnswerLattice.
/**
 * Recursively builds an answer lattice (Chinese words) from a Viterbi search graph
 * of binary predictions. This function does a limited amount of post-processing:
 * it preserves the whitespace of the input, and it does not segment between two Latin
 * characters or between two digits. Consequently, the probabilities of all paths in
 * answerLattice may not sum to 1 (they sum to 1 only if no post-processing applies).
 *
 * @param tSource Current node in the Viterbi search graph.
 * @param aSource Current node in the answer lattice.
 * @param answer Partial word starting at aSource.
 * @param nodeId Currently unused node identifier for the answer graph.
 * @param pos Current position in docArray.
 * @param cost Current cost of answer.
 * @param stateLinks Maps nodes of the search graph to nodes in the answer lattice
 * (when paths of the search graph are recombined, paths of the answer lattice should be
 * recombined as well, if at a word boundary).
 */
private void tagLatticeToAnswerLattice(DFSAState<String, Integer> tSource, DFSAState<String, Integer> aSource, StringBuilder answer, MutableInteger nodeId, int pos, double cost, Map<DFSAState<String, Integer>, DFSAState<String, Integer>> stateLinks, DFSA<String, Integer> answerLattice, CoreLabel[] docArray) {
  // Add "1" prediction after the end of the sentence, if applicable:
  if (tSource.isAccepting() && tSource.continuingInputs().isEmpty()) {
    tSource.addTransition(new DFSATransition<>("", tSource, new DFSAState<>(-1, null), "1", "", 0));
  }
  // Get current label, character, and prediction:
  CoreLabel curLabel = (pos < docArray.length) ? docArray[pos] : null;
  String curChr = null, origSpace = null;
  if (curLabel != null) {
    curChr = curLabel.get(CoreAnnotations.OriginalCharAnnotation.class);
    assert (curChr.length() == 1);
    origSpace = curLabel.get(CoreAnnotations.SpaceBeforeAnnotation.class);
  }
  // Get set of successors in search graph:
  Set<String> inputs = tSource.continuingInputs();
  // Only keep most probable transition out of initial state:
  String answerConstraint = null;
  if (pos == 0) {
    double minCost = Double.POSITIVE_INFINITY;
    // DFSATransition<String, Integer> bestTransition = null;
    for (String predictSpace : inputs) {
      DFSATransition<String, Integer> transition = tSource.transition(predictSpace);
      double transitionCost = transition.score();
      if (transitionCost < minCost) {
        if (predictSpace != null) {
          logger.info(String.format("mincost (%s): %e -> %e%n", predictSpace, minCost, transitionCost));
          minCost = transitionCost;
          answerConstraint = predictSpace;
        }
      }
    }
  }
  // Follow along each transition:
  for (String predictSpace : inputs) {
    DFSATransition<String, Integer> transition = tSource.transition(predictSpace);
    DFSAState<String, Integer> tDest = transition.target();
    DFSAState<String, Integer> newASource = aSource;
    //logger.info(String.format("tsource=%s tdest=%s asource=%s pos=%d predictSpace=%s%n", tSource, tDest, newASource, pos, predictSpace));
    StringBuilder newAnswer = new StringBuilder(answer.toString());
    int answerLen = newAnswer.length();
    String prevChr = (answerLen > 0) ? newAnswer.substring(answerLen - 1) : null;
    double newCost = cost;
    // Ignore paths starting with zero:
    if (answerConstraint != null && !answerConstraint.equals(predictSpace)) {
      logger.info(String.format("Skipping transition %s at pos 0.%n", predictSpace));
      continue;
    }
    // Ignore paths not consistent with input segmentation:
    if (flags.keepAllWhitespaces && "0".equals(predictSpace) && "1".equals(origSpace)) {
      logger.info(String.format("Skipping non-boundary at pos %d, since space in the input.%n", pos));
      continue;
    }
    // Do not hypothesize a boundary between two Latin characters or between two digits
    // (unless already present in original input):
    if ("1".equals(predictSpace) && "0".equals(origSpace) && prevChr != null && curChr != null) {
      char p = prevChr.charAt(0), c = curChr.charAt(0);
      if (ChineseStringUtils.isLetterASCII(p) && ChineseStringUtils.isLetterASCII(c)) {
        logger.info(String.format("Not hypothesizing a boundary at pos %d, since between two ASCII letters (%s and %s).%n", pos, prevChr, curChr));
        continue;
      }
      if (ChineseUtils.isNumber(p) && ChineseUtils.isNumber(c)) {
        logger.info(String.format("Not hypothesizing a boundary at pos %d, since between two numeral characters (%s and %s).%n", pos, prevChr, curChr));
        continue;
      }
    }
    // If predictSpace == 1, create a new transition in the answer search graph:
    if ("1".equals(predictSpace)) {
      if (newAnswer.toString().length() > 0) {
        // If answer destination node visited before, create a new edge and leave:
        if (stateLinks.containsKey(tSource)) {
          DFSAState<String, Integer> aDest = stateLinks.get(tSource);
          newASource.addTransition(new DFSATransition<>("", newASource, aDest, newAnswer.toString(), "", newCost));
          //logger.info(String.format("new transition: asource=%s adest=%s edge=%s%n", newASource, aDest, newAnswer));
          continue;
        }
        // If answer destination node not visited before, create it plus a new edge:
        nodeId.incValue(1);
        DFSAState<String, Integer> aDest = new DFSAState<>(nodeId.intValue(), answerLattice, 0.0);
        stateLinks.put(tSource, aDest);
        newASource.addTransition(new DFSATransition<>("", newASource, aDest, newAnswer.toString(), "", newCost));
        // Reached an accepting state:
        if (tSource.isAccepting()) {
          aDest.setAccepting(true);
          continue;
        }
        // Start new answer edge:
        newASource = aDest;
        newAnswer = new StringBuilder();
        newCost = 0.0;
      }
    }
    assert (curChr != null);
    newAnswer.append(curChr);
    newCost += transition.score();
    if (newCost < flags.searchGraphPrune || ChineseStringUtils.isLetterASCII(curChr.charAt(0)))
      tagLatticeToAnswerLattice(tDest, newASource, newAnswer, nodeId, pos + 1, newCost, stateLinks, answerLattice, docArray);
  }
}
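The two guards in the transition loop are what make the path probabilities mentioned in the Javadoc fail to sum to 1: a hypothesized boundary is dropped when it falls between two ASCII letters or two digits, and (with keepAllWhitespaces) a non-boundary is dropped where the input itself had a space. A standalone sketch of the boundary-suppression test follows, using java.lang.Character as a rough stand-in for ChineseStringUtils.isLetterASCII and ChineseUtils.isNumber; the class and method names below are ours, not CoreNLP's.

public class BoundaryGuardDemo {
  /** Returns true if a hypothesized word boundary between prev and cur should be suppressed. */
  static boolean suppressBoundary(char prev, char cur, boolean spaceInInput) {
    if (spaceInInput) {
      // The input itself had a space here, so the boundary is kept.
      return false;
    }
    // Never split a run of ASCII letters (e.g. "iPhone") or a run of digits (e.g. "2005").
    boolean bothAsciiLetters = prev < 128 && cur < 128 && Character.isLetter(prev) && Character.isLetter(cur);
    boolean bothDigits = Character.isDigit(prev) && Character.isDigit(cur);
    return bothAsciiLetters || bothDigits;
  }

  public static void main(String[] args) {
    System.out.println(suppressBoundary('a', 'b', false)); // true: inside a Latin word
    System.out.println(suppressBoundary('1', '2', false)); // true: inside a number
    System.out.println(suppressBoundary('a', '1', false)); // false: letter-digit boundary is allowed
  }
}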
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp:
the class RulesTest, method testMentionMatchesSpeakerAnnotation.
public void testMentionMatchesSpeakerAnnotation() {
  Mention g1 = new Mention(0, 0, 0, null);
  Mention m1 = new Mention(0, 0, 0, null);
  Mention m2 = new Mention(0, 0, 0, null);
  Mention m3 = new Mention(0, 0, 0, null);
  Mention m4 = new Mention(0, 0, 0, null);
  Mention m5 = new Mention(0, 0, 0, null);
  Mention m6 = new Mention(0, 0, 0, null);
  Mention m7 = new Mention(0, 0, 0, null);
  Mention m8 = new Mention(0, 0, 0, null);
  Mention g2 = new Mention(0, 0, 0, null);
  Mention g3 = new Mention(0, 0, 0, null);
  Mention g4 = new Mention(0, 0, 0, null);
  g1.headWord = new CoreLabel();
  g1.headWord.set(CoreAnnotations.SpeakerAnnotation.class, "john abraham bauer");
  m1.headString = "john";
  m2.headString = "bauer";
  m3.headString = "foo";
  m4.headString = "abraham";
  m5.headString = "braham";
  m6.headString = "zabraham";
  m7.headString = "abraha";
  m8.headString = "abrahamz";
  g2.headWord = new CoreLabel();
  g2.headWord.set(CoreAnnotations.SpeakerAnnotation.class, "john");
  g3.headWord = new CoreLabel();
  g3.headWord.set(CoreAnnotations.SpeakerAnnotation.class, "joh");
  g4.headWord = new CoreLabel();
  g4.headWord.set(CoreAnnotations.SpeakerAnnotation.class, "johnz");
  assertTrue(Rules.antecedentMatchesMentionSpeakerAnnotation(g1, m1));
  assertTrue(Rules.antecedentMatchesMentionSpeakerAnnotation(g1, m2));
  assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(g1, m3));
  assertTrue(Rules.antecedentMatchesMentionSpeakerAnnotation(g1, m4));
  assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(g1, m5));
  assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(g1, m6));
  assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(g1, m7));
  assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(g1, m8));
  assertTrue(Rules.antecedentMatchesMentionSpeakerAnnotation(g2, m1));
  assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(g3, m1));
  assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(g4, m1));
  // not symmetrical
  // also, shouldn't blow up if the annotation isn't set
  assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(m1, g1));
}
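Read together, the assertions pin down the matching rule: the antecedent's SpeakerAnnotation is split on whitespace, and the mention's headString must exactly equal one of those tokens; substrings, superstrings, and the reversed argument order are all rejected. Below is a simplified re-statement inferred from the test, not the actual CoreNLP implementation (SpeakerMatchDemo and speakerMatches are ours; the real logic lives in Rules.antecedentMatchesMentionSpeakerAnnotation).

import java.util.Arrays;

public class SpeakerMatchDemo {
  /** True iff mentionHead exactly equals one whitespace-separated token of the speaker string. */
  static boolean speakerMatches(String antecedentSpeaker, String mentionHead) {
    if (antecedentSpeaker == null || mentionHead == null) {
      return false;  // a missing annotation never matches (and never throws)
    }
    return Arrays.asList(antecedentSpeaker.split("\\s+")).contains(mentionHead);
  }

  public static void main(String[] args) {
    System.out.println(speakerMatches("john abraham bauer", "abraham"));  // true: exact token match
    System.out.println(speakerMatches("john abraham bauer", "braham"));   // false: substrings don't count
    System.out.println(speakerMatches("john abraham bauer", "abrahamz")); // false: superstrings don't count
  }
}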