use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class MultilingualTokenizeTextToColumn method processFile.
/**
 * Given an input file containing plain text, tokenize it and write the result to the named
 * output file.
 *
 * <p>The input is read line by line as UTF-8, with each line terminated by {@code "\n"} before
 * being handed to the text annotation builder. The output is written as UTF-8, one token per
 * line, with four tab-separated columns: a 1-based token index that restarts for every
 * sentence, the token surface form, the token start character offset and the token end
 * character offset. An empty line follows each sentence.
 *
 * @param corpus name of corpus
 * @param in file to tokenize
 * @param out output file for tokenized text
 * @throws IOException if {@code in} does not exist, is not a regular file, cannot be read
 *         completely, or if {@code out} cannot be written
 */
public void processFile(String corpus, File in, String out) throws IOException {
    if (!in.exists()) {
        throw new IOException("File '" + in.getAbsolutePath() + "' doesn't exist.");
    }
    if (!in.isFile()) {
        throw new IOException("File '" + in.getAbsolutePath() + "' exists but is not a file.");
    }

    StringBuilder sb = new StringBuilder();
    // try-with-resources releases the file handle even when reading fails part-way.
    try (Scanner scanner = new Scanner(new FileInputStream(in), StandardCharsets.UTF_8.name())) {
        while (scanner.hasNextLine()) {
            sb.append(scanner.nextLine()).append("\n");
        }
        // Scanner swallows I/O errors from the underlying stream and simply stops returning
        // lines; surface such an error instead of silently tokenizing truncated input.
        IOException readError = scanner.ioException();
        if (readError != null) {
            throw readError;
        }
    }
    String str = sb.toString();

    TextAnnotation ta = taBldr.createTextAnnotation(corpus, in.getName(), str);
    View sents = ta.getView(ViewNames.SENTENCE);
    logger.info("processing file '{}'; input length is {}", in.getAbsolutePath(), str.length());
    List<Constituent> toks = ta.getView(ViewNames.TOKENS).getConstituents();

    StringBuilder bldr = new StringBuilder();
    for (Constituent sent : sents) {
        int index = 1;
        for (Constituent tok : toks) {
            // a token belongs to the sentence when its character span lies inside the
            // sentence's character span
            if (tok.getStartCharOffset() >= sent.getStartCharOffset()
                    && tok.getEndCharOffset() <= sent.getEndCharOffset()) {
                bldr.append(Integer.toString(index++)).append("\t")
                        .append(tok.getSurfaceForm()).append("\t")
                        .append(tok.getStartCharOffset()).append("\t")
                        .append(tok.getEndCharOffset()).append(System.lineSeparator());
            }
        }
        // empty line to separate sentences
        bldr.append(System.lineSeparator());
    }

    String output = bldr.toString();
    logger.info("output length: {}", output.length());

    try (OutputStreamWriter writer = new OutputStreamWriter(
            new FileOutputStream(new File(out)), StandardCharsets.UTF_8.name())) {
        writer.write(output);
    } catch (IOException e) {
        // pass the exception itself as the last argument so the logger records the stack trace
        logger.error("Can't write to file {}: {}", out, e.getMessage(), e);
        throw e;
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class StanfordOpenIEHandler method addView.
/**
 * Annotates the text of {@code ta} with the pipeline and stores every extracted relation
 * triple in a {@link SpanLabelView} registered under {@code viewName}.
 *
 * <p>For each triple three constituents are created (subject, object, relation), each carrying
 * its gloss attributes, and two relations are added: {@code "subj"} from the relation
 * constituent to the subject and {@code "obj"} from the relation constituent to the object,
 * both scored with the triple's confidence.
 *
 * @param ta the text annotation that receives the new view
 * @throws AnnotatorException declared by the overridden method
 */
@Override
protected void addView(TextAnnotation ta) throws AnnotatorException {
    Annotation annotated = new Annotation(ta.text);
    pipeline.annotate(annotated);

    SpanLabelView openIeView = new SpanLabelView(viewName, ta);
    for (CoreMap sent : annotated.get(CoreAnnotations.SentencesAnnotation.class)) {
        Collection<RelationTriple> extracted =
                sent.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
        for (RelationTriple t : extracted) {
            // subject span and its attributes
            String subjGloss = t.subjectGloss();
            Constituent subjCons = getConstituent(subjGloss, t.subjectTokenSpan(), sent, ta);
            subjCons.addAttribute("subjectGloss", subjGloss);
            subjCons.addAttribute("subjectLemmaGloss", t.subjectLemmaGloss());
            subjCons.addAttribute("subjectLink", t.subjectLink());

            // object span and its attributes
            String objGloss = t.objectGloss();
            Constituent objCons = getConstituent(objGloss, t.objectTokenSpan(), sent, ta);
            objCons.addAttribute("objectGloss", objGloss);
            objCons.addAttribute("objectLemmaGloss", t.objectLemmaGloss());
            objCons.addAttribute("objectLink", t.objectLink());

            // relation (predicate) span and its attributes
            String relGloss = t.relationGloss();
            Constituent relCons = getConstituent(relGloss, t.relationTokenSpan(), sent, ta);
            relCons.addAttribute("relationGloss", relGloss);
            relCons.addAttribute("relationLemmaGloss", t.relationLemmaGloss());

            // edges point from the relation constituent to each of its arguments
            Relation toSubject = new Relation("subj", relCons, subjCons, t.confidence);
            Relation toObject = new Relation("obj", relCons, objCons, t.confidence);
            openIeView.addRelation(toSubject);
            openIeView.addRelation(toObject);

            openIeView.addConstituent(subjCons);
            openIeView.addConstituent(objCons);
            openIeView.addConstituent(relCons);
        }
    }
    ta.addView(viewName, openIeView);
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class StanfordTrueCaseHandler method addView.
/**
 * Annotates the text of {@code ta} with the pipeline and stores the true-case form of every
 * pipeline token in a {@link TokenLabelView} registered under {@code viewName}.
 *
 * <p>Each pipeline token is mapped onto the {@code TOKENS} view of {@code ta} through its
 * character span: among the constituents overlapping that span, the one with the largest end
 * token index is chosen and a single-token constituent labelled with the true-case text is
 * created at that position.
 *
 * @param ta the text annotation that receives the new view
 * @throws AnnotatorException if a pipeline token does not overlap any token of {@code ta}
 */
@Override
protected void addView(TextAnnotation ta) throws AnnotatorException {
    Annotation document = new Annotation(ta.text);
    pipeline.annotate(document);
    TokenLabelView vu = new TokenLabelView(viewName, ta);
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            String trueCase = token.get(CoreAnnotations.TrueCaseTextAnnotation.class);
            int beginCharOffsetS = token.beginPosition();
            // NOTE(review): the -1 presumably turns an exclusive end offset into an
            // inclusive one for the overlap query -- confirm against both APIs.
            int endCharOffset = token.endPosition() - 1;
            List<Constituent> overlappingCons = ta.getView(ViewNames.TOKENS)
                    .getConstituentsOverlappingCharSpan(beginCharOffsetS, endCharOffset);
            // Fail with a descriptive, declared exception when the two tokenizations do not
            // line up, rather than an unchecked NoSuchElementException without context.
            int endIndex = overlappingCons.stream()
                    .mapToInt(Constituent::getEndSpan)
                    .max()
                    .orElseThrow(() -> new AnnotatorException(
                            "No token in view '" + ViewNames.TOKENS
                                    + "' overlaps the character span [" + beginCharOffsetS
                                    + ", " + endCharOffset + "]"));
            Constituent c = new Constituent(trueCase, viewName, ta, endIndex - 1, endIndex);
            vu.addConstituent(c);
        }
    }
    ta.addView(viewName, vu);
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class StanfordCorefHandler method addView.
/**
 * Annotates the text of {@code ta} with the pipeline and stores the resulting coreference
 * chains in a {@link CoreferenceView} registered under {@code viewName}.
 *
 * <p>For every chain, a constituent is built for the representative mention and for each
 * mention in textual order; the representative is removed from the mention list and coref
 * edges are added from the representative to the remaining mentions.
 *
 * @param ta the text annotation that receives the new view
 * @throws AnnotatorException declared by the overridden method
 */
@Override
protected void addView(TextAnnotation ta) throws AnnotatorException {
    Annotation document = new Annotation(ta.text);
    pipeline.annotate(document);
    CoreferenceView vu = new CoreferenceView(viewName, ta);
    // Typed map instead of a raw Map: no Object keys and no per-entry cast.
    Map<Integer, CorefChain> corefChains =
            document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
    // The annotation lookup yields null when the key was never set; in that case the view
    // is still added, just without any edges.
    if (corefChains != null) {
        for (CorefChain chain : corefChains.values()) {
            Constituent representative =
                    createConstituentGivenMention(document, chain, chain.getRepresentativeMention(), ta);
            List<Constituent> consList = new ArrayList<>();
            for (CorefChain.CorefMention m : chain.getMentionsInTextualOrder()) {
                consList.add(createConstituentGivenMention(document, chain, m, ta));
            }
            // remove the representative itself
            consList.remove(representative);
            vu.addCorefEdges(representative, consList);
        }
    }
    ta.addView(viewName, vu);
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class IllinoisLemmatizer method createLemmaView.
/**
 * Builds a {@code LEMMA} view for the given TextAnnotation, attaches it to that
 * TextAnnotation, and returns it.
 *
 * <p>One single-token constituent is created per token, labelled with the value returned by
 * {@code getLemma} for that token index.
 *
 * @param inputTa the text annotation to lemmatize; the new view is added to it
 * @return the view that was added to {@code inputTa}
 * @throws IOException declared for callers; not thrown directly by the code shown here
 */
public View createLemmaView(TextAnnotation inputTa) throws IOException {
    final int tokenCount = inputTa.getTokens().length;
    TokenLabelView view = new TokenLabelView(ViewNames.LEMMA, NAME, inputTa, 1.0);

    int tokenId = 0;
    while (tokenId < tokenCount) {
        // each lemma covers exactly the token span [tokenId, tokenId + 1)
        String tokenLemma = getLemma(inputTa, tokenId);
        view.addConstituent(
                new Constituent(tokenLemma, ViewNames.LEMMA, inputTa, tokenId, tokenId + 1));
        tokenId++;
    }

    inputTa.addView(ViewNames.LEMMA, view);
    return view;
}
Aggregations