Search in sources :

Example 21 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class MascReader method main.

/**
 * Command-line driver: reads every document the {@link MascReader} yields from the given MASC
 * directory, writes the gold TextAnnotation as json, re-processes the same raw text with a
 * {@link StatefulTokenizer}-backed builder, writes that predicted TextAnnotation as json too,
 * and finally reports how many gold token and sentence spans the predicted views matched.
 *
 * Gold files go to {@code outDir}; predicted files go to {@code outDir + "_PRED"}.
 * Any failure to create the reader or to write a file prints the stack trace and exits with
 * status -1.
 *
 * @param args exactly two entries: the MASC root dir of written files
 *        (e.g. /home/mssammon/work/data/masc-ccg/written/) and the output directory
 */
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage: " + NAME + " mascCorpusDir outDir");
        System.exit(-1);
    }
    String mascRoot = args[0];
    String goldOutDir = args[1];
    String predOutDir = goldOutDir + "_PRED";

    // The same directory serves as both corpus and source directory for the reader.
    Properties config = new Properties();
    config.setProperty(CorpusReaderConfigurator.CORPUS_DIRECTORY.key, mascRoot);
    config.setProperty(CorpusReaderConfigurator.SOURCE_DIRECTORY.key, mascRoot);
    IOUtils.mkdir(goldOutDir);
    IOUtils.mkdir(predOutDir);
    ResourceManager readerConfig = new ResourceManager(config);

    MascReader reader = null;
    try {
        reader = new MascReader(readerConfig);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }

    TextAnnotationBuilder predictedBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, false));

    // Running totals over the whole corpus.
    int tokensMatched = 0;
    int tokensSeen = 0;
    int sentencesMatched = 0;
    int sentencesSeen = 0;

    while (reader.hasNext()) {
        TextAnnotation gold = reader.next();
        String docId = gold.getId();
        String rawText = gold.getText();
        TextAnnotation predicted = predictedBuilder.createTextAnnotation(gold.getCorpusId() + "_PREDICTED", docId, rawText);

        // Compare predicted token spans against the gold token character offsets.
        IntPair[] goldTokenSpans = getCharacterOffsets(gold.getView(ViewNames.TOKENS));
        tokensSeen += goldTokenSpans.length;
        tokensMatched += countCorrectSpans(predicted.getView(ViewNames.TOKENS), goldTokenSpans);

        // Same comparison for sentence spans.
        IntPair[] goldSentenceSpans = getCharacterOffsets(gold.getView(ViewNames.SENTENCE));
        sentencesSeen += goldSentenceSpans.length;
        sentencesMatched += countCorrectSpans(predicted.getView(ViewNames.SENTENCE), goldSentenceSpans);

        // Gold annotation -> <outDir>/<id>.json
        String goldJson = SerializationHelper.serializeToJson(gold, true);
        String goldPath = Paths.get(goldOutDir, docId + ".json").toString();
        try {
            logger.trace("Writing file out to '{}'...", goldPath);
            LineIO.write(goldPath, Collections.singletonList(goldJson));
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(-1);
        }

        // Predicted annotation -> <outDir>_PRED/<id>.json
        String predPath = Paths.get(predOutDir, predicted.getId() + ".json").toString();
        String predJson = SerializationHelper.serializeToJson(predicted, true);
        try {
            logger.debug("writing file '{}'...", predPath);
            LineIO.write(predPath, Collections.singletonList(predJson));
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(-1);
        }
        logger.debug("## finished processing file '{}'.", gold.getId());
    }

    System.out.println(reader.generateReport());
    System.out.print("TOKEN PERFORMANCE:");
    computeAndPrintAcc(tokensMatched, tokensSeen);
    System.out.print("SENTENCE PERFORMANCE:");
    computeAndPrintAcc(sentencesMatched, sentencesSeen);
}
Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) XMLStreamException(javax.xml.stream.XMLStreamException) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)

Example 22 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class MascReader method removeOverlappingSentences.

/**
 * Prunes {@code sentences} in place so that sentences whose spans nest inside or overlap an
 * already accepted sentence are dropped.
 *
 * Sentences are visited in list order and compared against the spans accepted so far:
 * <ul>
 *   <li>if {@code isInside(sentOffset, offset)} holds for an accepted span, or the spans
 *       overlap and {@code isLarger(offset, sentOffset)} holds, the visited sentence is
 *       rejected;</li>
 *   <li>if {@code isInside(offset, sentOffset)} holds, or the spans overlap and the accepted
 *       span is not larger, the accepted sentence is displaced by the visited one.</li>
 * </ul>
 * Displacements are only committed when the visited sentence itself is accepted, so a
 * sentence that ends up rejected never causes another sentence to be removed.
 *
 * Previously the span-to-sentence map was never filled, so the comparison loop had nothing
 * to iterate over and the method removed nothing; accepted sentences are now registered.
 *
 * (Original note: this method may be redundant at this point.)
 *
 * @param sentences the sentence list to prune; modified in place
 */
private void removeOverlappingSentences(List<SentenceStaxParser.MascSentence> sentences) {
    // Sentences accepted so far, keyed by their (start, end) span.
    Map<IntPair, SentenceStaxParser.MascSentence> offsetsToSentences = new HashMap<>();
    // Visited sentences rejected in favour of an already accepted sentence.
    Set<SentenceStaxParser.MascSentence> sentsToRemove = new HashSet<>();
    // Previously accepted sentences that a later sentence displaced.
    Set<SentenceStaxParser.MascSentence> displacedSents = new HashSet<>();
    for (SentenceStaxParser.MascSentence sent : sentences) {
        IntPair sentOffset = new IntPair(sent.start, sent.end);
        // Accepted spans this sentence would displace; committed only if it survives.
        Set<IntPair> offsetsToRemove = new HashSet<>();
        boolean rejectSent = false;
        for (IntPair offset : offsetsToSentences.keySet()) {
            if (isInside(sentOffset, offset)) {
                rejectSent = true;
                break;
            } else if (isInside(offset, sentOffset)) {
                offsetsToRemove.add(offset);
            } else if (isOverlap(offset, sentOffset)) {
                if (isLarger(offset, sentOffset)) {
                    rejectSent = true;
                    break;
                } else {
                    offsetsToRemove.add(offset);
                }
            }
        }
        if (rejectSent) {
            sentsToRemove.add(sent);
            continue;
        }
        // Map is mutated only after the keySet iteration above has finished.
        for (IntPair offset : offsetsToRemove) {
            displacedSents.add(offsetsToSentences.remove(offset));
        }
        offsetsToSentences.put(sentOffset, sent);
    }
    logger.debug("## removing at least {}, and at most {}, sentences...", sentsToRemove.size(), (sentsToRemove.size() + displacedSents.size()));
    for (SentenceStaxParser.MascSentence sent : sentsToRemove) {
        sentences.remove(sent);
    }
    for (SentenceStaxParser.MascSentence sent : displacedSents) {
        sentences.remove(sent);
    }
}
Also used : TIntIntHashMap(gnu.trove.map.hash.TIntIntHashMap) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 23 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class SRLNode method compileLinks.

/**
 * Resolves every link of this node to the token span of the constituent it points at.
 *
 * For each link, {@code link.where.getFirst()} is looked up in {@code tokenmap} to find the
 * starting tree node, and {@code link.where.getSecond()} gives the number of parent steps to
 * climb from there (climbing stops early at a node that has no parent). The label of the
 * node reached supplies the start and end span of the resulting argument.
 *
 * @param tokenmap maps the first component of a link's {@code where} pair to a tree node.
 * @return one {@link PredicateArgument} per link, in the order the links are stored.
 */
public ArrayList<PredicateArgument> compileLinks(HashMap<Integer, Tree<Constituent>> tokenmap) {
    ArrayList<PredicateArgument> arguments = new ArrayList<PredicateArgument>();
    for (SRLLink link : links) {
        Tree<Constituent> current = tokenmap.get(link.where.getFirst());
        // Walk upwards the requested number of levels, or until the top of the tree.
        int levelsLeft = link.where.getSecond();
        while (levelsLeft > 0 && current.getParent() != null) {
            current = current.getParent();
            levelsLeft--;
        }
        Constituent covered = current.getLabel();
        IntPair extent = new IntPair(covered.getStartSpan(), covered.getEndSpan());
        arguments.add(new PredicateArgument(link.argument, link.link, extent));
    }
    return arguments;
}
Also used : ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 24 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class PARC3Reader method getAnnotationsFromFile.

/**
 * Parses each XML document in {@code list} into a
 * {@link edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation
 * TextAnnotation}.
 *
 * The document text is rebuilt from the word elements: before each word, the gap between the
 * previous word's end byte offset and this word's start byte offset is filled with spaces,
 * and the token's character offsets are taken from the rebuilt text rather than from the
 * byte offsets in the file. The TextAnnotation is constructed from the tokens, their
 * character offsets and the sentence-end token indexes. POS and lemma views are populated
 * only when {@code bPopulatePOS} / {@code bPopulateLemma} are set; attribution relations are
 * always passed to {@code populateAttribution}.
 *
 * @param list paths of the xml documents to read
 * @return one TextAnnotation per path, in the same order
 * @throws Exception if files can't be found, or if parser fails to read annotation format
 */
@Override
public List<TextAnnotation> getAnnotationsFromFile(List<Path> list) throws Exception {
    List<TextAnnotation> annotations = new ArrayList<>();
    for (Path xmlPath : list) {
        String docId = IOUtils.getFileStem(xmlPath.toFile().getName());
        logger.info("Processing: {}", docId);

        // Per-token data, appended in document order.
        List<String> tokenForms = new ArrayList<>();
        List<IntPair> tokenCharSpans = new ArrayList<>();
        List<String> posTags = new ArrayList<>();
        List<String> lemmaForms = new ArrayList<>();
        // One entry per non-empty sentence: index one past its last token.
        List<Integer> sentenceEnds = new ArrayList<>();
        // Attribution relations keyed by relation id.
        Map<String, AttributionRelation> relationsById = new HashMap<>();
        // Document text as reconstructed from the word elements.
        StringBuilder rebuiltText = new StringBuilder();
        int prevEndByte = 0;
        int nextTokenIdx = 0;

        Document dom = XMLUtils.getXMLDOM(xmlPath.toString());
        // Optional, we don't actually need this, as of now.
        dom.getDocumentElement().normalize();

        NodeList sentenceNodes = dom.getElementsByTagName(NODE_SENTENCE);
        for (int s = 0; s < sentenceNodes.getLength(); s++) {
            Element sentenceElem = (Element) sentenceNodes.item(s);
            NodeList wordNodes = sentenceElem.getElementsByTagName(NODE_WORD);
            int lastWord = wordNodes.getLength() - 1;
            for (int w = 0; w <= lastWord; w++) {
                Element wordElem = (Element) wordNodes.item(w);

                // Record every attribution role attached to this token (index not yet advanced).
                NodeList relationNodes = wordElem.getElementsByTagName(NODE_ATTRIBUTION);
                for (int r = 0; r < relationNodes.getLength(); r++) {
                    Element relationElem = (Element) relationNodes.item(r);
                    String relationId = relationElem.getAttribute(ATTR_RELATION_ID);
                    NodeList roleNodes = relationElem.getElementsByTagName(NODE_ATTRIBUTION_ROLE);
                    for (int k = 0; k < roleNodes.getLength(); k++) {
                        Element roleElem = (Element) roleNodes.item(k);
                        String role = roleElem.getAttribute(ATTR_ROLE_VALUE);
                        updateAttributionRelation(relationsById, relationId, role, nextTokenIdx);
                    }
                }

                String form = wordElem.getAttribute(ATTR_WORD_TEXT);
                String posTag = wordElem.getAttribute(ATTR_POS);
                String lemma = wordElem.getAttribute(ATTR_LEM);

                // "start,end" byte offsets according to PARC, which are not accurate; they are
                // used only to size the whitespace gap between consecutive words.
                String[] byteRange = wordElem.getAttribute(ATTR_BYTE_COUNT).split(",");
                int startByte = Integer.parseInt(byteRange[0]);
                int endByte = Integer.parseInt(byteRange[1]);

                String padding = String.join("", Collections.nCopies(startByte - prevEndByte, " "));
                rebuiltText.append(padding);
                int startChar = rebuiltText.length();
                rebuiltText.append(form);
                int endChar = rebuiltText.length();
                prevEndByte = endByte;

                tokenForms.add(form);
                tokenCharSpans.add(new IntPair(startChar, endChar));
                posTags.add(posTag);
                lemmaForms.add(lemma);
                nextTokenIdx++;

                // Close the sentence after its final word.
                if (w == lastWord) {
                    sentenceEnds.add(nextTokenIdx);
                }
            }
        }

        int[] sentenceEndIdx = sentenceEnds.stream().mapToInt(Integer::intValue).toArray();
        TextAnnotation ta = new TextAnnotation(super.corpusName, docId, rebuiltText.toString(), tokenCharSpans.toArray(new IntPair[0]), tokenForms.toArray(new String[0]), sentenceEndIdx);
        if (bPopulatePOS) {
            populatePOS(ta, posTags);
        }
        if (bPopulateLemma) {
            populateLemma(ta, lemmaForms);
        }
        populateAttribution(ta, relationsById);
        annotations.add(ta);
    }
    return annotations;
}
Also used : Path(java.nio.file.Path) NodeList(org.w3c.dom.NodeList) Element(org.w3c.dom.Element) Document(org.w3c.dom.Document) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)

Example 25 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ConvertOntonotesToColumn method getNameTextAnnotation.

/**
 * Reads the given file, builds an {@link XmlTextAnnotation} from its contents, and attaches a
 * {@code ViewNames.NER_ONTONOTES} view holding one constituent per {@code enamex} markup span.
 *
 * Each span's label is the first value of its {@code type} attribute. Span character offsets
 * refer to the original (marked-up) text and are mapped to offsets in the cleaned text before
 * being converted to token indexes.
 *
 * @param file the file to read.
 * @return the XmlTextAnnotation containing the text annotation, and xml markup offset data.
 * @throws IOException if the file cannot be read
 */
private static XmlTextAnnotation getNameTextAnnotation(File file) throws IOException {
    String rawXml = LineIO.slurp(file.getCanonicalPath());

    // we keep everything.
    XmlDocumentProcessor markupProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    TokenizerTextAnnotationBuilder tokenizingBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    XmlTextAnnotationMaker maker = new XmlTextAnnotationMaker(tokenizingBuilder, markupProcessor);

    // read the file and create the annotation.
    XmlTextAnnotation xta = maker.createTextAnnotation(rawXml, "OntoNotes 5.0", "test");
    TextAnnotation ta = xta.getTextAnnotation();
    List<SpanInfo> markupSpans = xta.getXmlMarkup();

    // create the named entity view
    View nerView = new SpanLabelView(ViewNames.NER_ONTONOTES, ta);
    for (SpanInfo markup : markupSpans) {
        if (!"enamex".equalsIgnoreCase(markup.label)) {
            continue;
        }
        IntPair originalSpan = markup.spanOffsets;
        String entityType = markup.attributes.get("type").getFirst();

        // Translate offsets in the original text into offsets in the cleaned text.
        int cleanStartChar = xta.getXmlSt().computeModifiedOffsetFromOriginal(originalSpan.getFirst());
        int cleanEndChar = xta.getXmlSt().computeModifiedOffsetFromOriginal(originalSpan.getSecond());

        int firstToken = ta.getTokenIdFromCharacterOffset(cleanStartChar);
        // StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
        int lastToken = ta.getTokenIdFromCharacterOffset(cleanEndChar - 1);

        // constituent token indexing uses one-past-the-end
        nerView.addConstituent(new Constituent(entityType, nerView.getViewName(), ta, firstToken, lastToken + 1));
    }
    ta.addView(ViewNames.NER_ONTONOTES, nerView);
    return xta;
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) SpanInfo(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)129 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)39 ArrayList (java.util.ArrayList)27 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)26 Test (org.junit.Test)21 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)18 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)14 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)8 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)7 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)6 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)5 Matcher (java.util.regex.Matcher)5 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)4 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)4 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)4 FileNotFoundException (java.io.FileNotFoundException)4 IOException (java.io.IOException)4 JsonObject (com.google.gson.JsonObject)3 TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder)3 PredicateArgumentView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView)3