Search in sources:

Example 56 with Constituent

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.

Class MultilingualTokenizeTextToColumn, method processFile.

/**
 * Given an input file containing plain text, tokenize it and write the tokens to the named
 * output file in column format.
 *
 * <p>Every token is written on its own line as four tab-separated fields: the 1-based index
 * of the token within its sentence, the surface form, the start character offset and the end
 * character offset. An empty line is written after every sentence.
 *
 * @param corpus name of corpus
 * @param in file to tokenize; read as UTF-8, with every line terminated by "\n"
 * @param out output file for tokenized text; written as UTF-8
 * @throws IOException if {@code in} does not exist or is not a regular file, if reading it
 *         fails, or if the output file cannot be written
 */
public void processFile(String corpus, File in, String out) throws IOException {
    if (!in.exists())
        throw new IOException("File '" + in.getAbsolutePath() + "' doesn't exist.");
    if (!in.isFile())
        throw new IOException("File '" + in.getAbsolutePath() + "' exists but is not a file.");
    StringBuilder sb = new StringBuilder();
    // try-with-resources closes the scanner (and the wrapped stream) even if reading throws
    try (Scanner scanner = new Scanner(new FileInputStream(in), StandardCharsets.UTF_8.name())) {
        while (scanner.hasNextLine()) {
            sb.append(scanner.nextLine()).append("\n");
        }
        // Scanner swallows read errors and simply reports "no more lines"; surface them so a
        // failed read is not mistaken for a shorter file.
        IOException readError = scanner.ioException();
        if (readError != null)
            throw readError;
    }
    String str = sb.toString();
    TextAnnotation ta = taBldr.createTextAnnotation(corpus, in.getName(), str);
    View sents = ta.getView(ViewNames.SENTENCE);
    logger.info("processing file '{}'; input length is {}", in.getAbsolutePath(), str.length());
    List<Constituent> toks = ta.getView(ViewNames.TOKENS).getConstituents();
    String eol = System.lineSeparator();
    StringBuilder bldr = new StringBuilder();
    for (Constituent sent : sents) {
        // token numbering restarts at 1 for every sentence
        int index = 1;
        for (Constituent tok : toks) {
            // a token belongs to the sentence when its character span lies inside the sentence's span
            if (tok.getStartCharOffset() >= sent.getStartCharOffset() && tok.getEndCharOffset() <= sent.getEndCharOffset()) {
                bldr.append(index++).append("\t").append(tok.getSurfaceForm()).append("\t").append(tok.getStartCharOffset()).append("\t").append(tok.getEndCharOffset()).append(eol);
            }
        }
        // empty line to separate sentences
        bldr.append(eol);
    }
    // materialize the output once; it is used for both the length report and the write
    String output = bldr.toString();
    System.err.println("output length: " + output.length());
    try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(out)), StandardCharsets.UTF_8.name())) {
        writer.write(output);
    } catch (IOException e) {
        // the trailing throwable makes the logger record the stack trace
        logger.error("Can't write to file {}: {}", out, e.getMessage(), e);
        throw e;
    }
}
Also used : Scanner(java.util.Scanner) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 57 with Constituent

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.

Class StanfordOpenIEHandler, method addView.

/**
 * Annotates the text of {@code ta} with {@code pipeline} and stores the resulting relation
 * triples in a new {@link SpanLabelView} that is added to {@code ta} under {@code viewName}.
 *
 * <p>For every triple, three constituents (subject, object, relation) are created through
 * {@code getConstituent} and tagged with the triple's gloss attributes. Two relations,
 * labeled "subj" and "obj", lead from the relation constituent to the subject and object
 * constituents; both carry the triple's confidence as their score.
 *
 * @param ta the text annotation that receives the new view
 * @throws AnnotatorException declared by the overridden method
 */
@Override
protected void addView(TextAnnotation ta) throws AnnotatorException {
    Annotation stanfordDoc = new Annotation(ta.text);
    pipeline.annotate(stanfordDoc);
    SpanLabelView openIeView = new SpanLabelView(viewName, ta);
    for (CoreMap stanfordSentence : stanfordDoc.get(CoreAnnotations.SentencesAnnotation.class)) {
        Collection<RelationTriple> extracted = stanfordSentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
        for (RelationTriple t : extracted) {
            // subject argument
            Constituent subjectCons = getConstituent(t.subjectGloss(), t.subjectTokenSpan(), stanfordSentence, ta);
            subjectCons.addAttribute("subjectGloss", t.subjectGloss());
            subjectCons.addAttribute("subjectLemmaGloss", t.subjectLemmaGloss());
            subjectCons.addAttribute("subjectLink", t.subjectLink());

            // object argument
            Constituent objectCons = getConstituent(t.objectGloss(), t.objectTokenSpan(), stanfordSentence, ta);
            objectCons.addAttribute("objectGloss", t.objectGloss());
            objectCons.addAttribute("objectLemmaGloss", t.objectLemmaGloss());
            objectCons.addAttribute("objectLink", t.objectLink());

            // the relation phrase connecting the two arguments
            Constituent relationCons = getConstituent(t.relationGloss(), t.relationTokenSpan(), stanfordSentence, ta);
            relationCons.addAttribute("relationGloss", t.relationGloss());
            relationCons.addAttribute("relationLemmaGloss", t.relationLemmaGloss());

            // edges lead from the relation phrase to each of its arguments
            Relation toSubject = new Relation("subj", relationCons, subjectCons, t.confidence);
            Relation toObject = new Relation("obj", relationCons, objectCons, t.confidence);
            openIeView.addRelation(toSubject);
            openIeView.addRelation(toObject);
            openIeView.addConstituent(subjectCons);
            openIeView.addConstituent(objectCons);
            openIeView.addConstituent(relationCons);
        }
    }
    ta.addView(viewName, openIeView);
}
Also used : Relation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Relation) RelationTriple(edu.stanford.nlp.ie.util.RelationTriple) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) NaturalLogicAnnotations(edu.stanford.nlp.naturalli.NaturalLogicAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Annotation(edu.stanford.nlp.pipeline.Annotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 58 with Constituent

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.

Class StanfordTrueCaseHandler, method addView.

/**
 * Annotates the text of {@code ta} with {@code pipeline} and adds a {@link TokenLabelView}
 * named {@code viewName} that holds one constituent per pipeline token, labeled with the
 * token's true-case text.
 *
 * <p>Pipeline tokens are aligned to the TOKENS view by character offset: among the TOKENS
 * constituents overlapping the pipeline token's character span, the largest end span is
 * taken, and the new constituent covers the single token position ending there.
 *
 * @param ta the text annotation that receives the new view
 * @throws AnnotatorException if a pipeline token overlaps no constituent of the TOKENS view
 */
@Override
protected void addView(TextAnnotation ta) throws AnnotatorException {
    Annotation document = new Annotation(ta.text);
    pipeline.annotate(document);
    TokenLabelView vu = new TokenLabelView(viewName, ta);
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            String trueCase = token.get(CoreAnnotations.TrueCaseTextAnnotation.class);
            int beginCharOffset = token.beginPosition();
            // NOTE(review): the -1 presumably converts an exclusive end position into the
            // offset of the token's last character — confirm against CoreLabel.endPosition()
            int endCharOffset = token.endPosition() - 1;
            List<Constituent> overlappingCons = ta.getView(ViewNames.TOKENS).getConstituentsOverlappingCharSpan(beginCharOffset, endCharOffset);
            // An empty overlap means the two tokenizations disagree at this position; report it
            // with the offending offsets instead of failing with a bare NoSuchElementException.
            int endIndex = overlappingCons.stream()
                    .mapToInt(Constituent::getEndSpan)
                    .max()
                    .orElseThrow(() -> new AnnotatorException("No constituent in view '" + ViewNames.TOKENS
                            + "' overlaps character span [" + beginCharOffset + ", " + endCharOffset
                            + "]; cannot add it to view '" + viewName + "'."));
            Constituent c = new Constituent(trueCase, viewName, ta, endIndex - 1, endIndex);
            vu.addConstituent(c);
        }
    }
    ta.addView(viewName, vu);
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) TokenLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TokenLabelView) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Annotation(edu.stanford.nlp.pipeline.Annotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 59 with Constituent

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.

Class StanfordCorefHandler, method addView.

/**
 * Annotates the text of {@code ta} with {@code pipeline} and adds a {@link CoreferenceView}
 * named {@code viewName} built from the coreference chains found on the annotated document.
 *
 * <p>For every chain, a constituent is created for the representative mention and for each
 * mention in textual order (via {@code createConstituentGivenMention}); the representative
 * is removed from the mention list and coreference edges are added from it to the remaining
 * mentions. If the document carries no chain annotation, an empty view is added.
 *
 * @param ta the text annotation that receives the new view
 * @throws AnnotatorException declared by the overridden method
 */
@Override
protected void addView(TextAnnotation ta) throws AnnotatorException {
    Annotation document = new Annotation(ta.text);
    pipeline.annotate(document);
    CoreferenceView vu = new CoreferenceView(viewName, ta);
    Map<Integer, CorefChain> corefChains = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
    // get(...) yields null when the annotation was never set; treat that as "no chains"
    if (corefChains != null) {
        for (CorefChain chain : corefChains.values()) {
            Constituent representative = createConstituentGivenMention(document, chain, chain.getRepresentativeMention(), ta);
            List<Constituent> consList = new ArrayList<>();
            for (CorefChain.CorefMention m : chain.getMentionsInTextualOrder()) {
                consList.add(createConstituentGivenMention(document, chain, m, ta));
            }
            // remove the representative itself
            consList.remove(representative);
            vu.addCorefEdges(representative, consList);
        }
    }
    ta.addView(viewName, vu);
}
Also used : CoreferenceView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.CoreferenceView) CorefChain(edu.stanford.nlp.coref.data.CorefChain) CorefCoreAnnotations(edu.stanford.nlp.coref.CorefCoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Annotation(edu.stanford.nlp.pipeline.Annotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 60 with Constituent

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.

Class IllinoisLemmatizer, method createLemmaView.

/**
 * Builds a lemma view for the given TextAnnotation: one constituent per token position,
 * labeled with the result of {@code getLemma} for that position. The view is registered on
 * {@code inputTa} under {@code ViewNames.LEMMA} and also returned to the caller.
 *
 * @param inputTa the text annotation to lemmatize; it is modified by adding the new view
 * @return the view that was added to {@code inputTa}
 * @throws IOException declared in the signature; possibly raised by {@code getLemma}
 */
public View createLemmaView(TextAnnotation inputTa) throws IOException {
    final int tokenCount = inputTa.getTokens().length;
    TokenLabelView view = new TokenLabelView(ViewNames.LEMMA, NAME, inputTa, 1.0);
    int position = 0;
    while (position < tokenCount) {
        String label = getLemma(inputTa, position);
        // each lemma covers exactly one token: [position, position + 1)
        view.addConstituent(new Constituent(label, ViewNames.LEMMA, inputTa, position, position + 1));
        position++;
    }
    inputTa.addView(ViewNames.LEMMA, view);
    return view;
}
Also used : TokenLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TokenLabelView) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Aggregations

Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)176 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)95 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)51 Feature (edu.illinois.cs.cogcomp.edison.features.Feature)44 Test (org.junit.Test)39 ArrayList (java.util.ArrayList)29 Relation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Relation)25 EdisonException (edu.illinois.cs.cogcomp.edison.utilities.EdisonException)24 LinkedHashSet (java.util.LinkedHashSet)22 TreeView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TreeView)20 DiscreteFeature (edu.illinois.cs.cogcomp.edison.features.DiscreteFeature)20 FeatureExtractor (edu.illinois.cs.cogcomp.edison.features.FeatureExtractor)17 ProjectedPath (edu.illinois.cs.cogcomp.edison.features.lrec.ProjectedPath)16 FeatureManifest (edu.illinois.cs.cogcomp.edison.features.manifest.FeatureManifest)16 FileInputStream (java.io.FileInputStream)16 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)14 PredicateArgumentView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView)13 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)12 HashSet (java.util.HashSet)12 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)11