Search in sources :

Example 16 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class ACE_NW_Reader method parse.

/**
 * Parses a tagged document into its {@code <TEXT>} paragraphs and its document-level metadata.
 *
 * <p>Metadata is read from the {@code <DOCID>}, {@code <DATETIME>} and {@code <HEADLINE>}
 * elements and stored under the matching {@code DocumentMetadata} keys; when an element occurs
 * several times the last occurrence wins, and a missing element yields the empty string.
 *
 * <p>Every {@code <TEXT>...</TEXT>} element becomes one paragraph labelled {@code "text"}. The
 * paragraph offset is the position of the trimmed element body inside {@code content}, taken
 * from the regex match itself so that text which also occurs earlier in the document cannot
 * produce a wrong offset. Note that the patterns are compiled without {@code DOTALL}, so an
 * element body is only matched when it contains no line terminator.
 *
 * @param content the document text including its markup
 * @param contentRemovingTags the same document with the markup removed; used to compute each
 *        paragraph's {@code offsetFilterTags}
 * @return a pair of (labelled paragraphs in document order, metadata map)
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    metadata.put(DocumentMetadata.DocumentID, lastTrimmedMatch(content, "<DOCID>(.*?)</DOCID>"));
    metadata.put(DocumentMetadata.DocumentCreationTime, lastTrimmedMatch(content, "<DATETIME>(.*?)</DATETIME>"));
    metadata.put(DocumentMetadata.HeadLine, lastTrimmedMatch(content, "<HEADLINE>(.*?)</HEADLINE>"));

    Matcher textMatcher = Pattern.compile("<TEXT>(.*?)</TEXT>").matcher(content);
    while (textMatcher.find()) {
        String rawBody = textMatcher.group(1);
        String text = rawBody.trim();
        // trim() only strips characters <= ' ', so the first occurrence of the trimmed text
        // inside the raw body is exactly the amount of leading whitespace that was removed.
        int offset = textMatcher.start(1) + rawBody.indexOf(text);
        paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(offset, text)));
    }

    // Locate each paragraph in the tag-free text. The search start is advanced by the length
    // of every paragraph seen so far (not to the end of the previous hit).
    int searchFrom = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        paragraph.offsetFilterTags = contentRemovingTags.indexOf(paragraph.content, searchFrom);
        searchFrom += paragraph.content.length();
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            if (paragraph.offsetFilterTags == -1) {
                // indexOf found nothing; substring(-1, ...) would throw, so report it instead.
                logger.info("[No match phrase in filtered content.]");
            } else {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}

/** Returns the trimmed first capture group of the last match of {@code regex} in {@code input}, or "" if there is no match. */
private static String lastTrimmedMatch(String input, String regex) {
    String value = "";
    Matcher matcher = Pattern.compile(regex).matcher(input);
    while (matcher.find()) {
        value = matcher.group(1).trim();
    }
    return value;
}
Also used : Pattern(java.util.regex.Pattern) HashMap(java.util.HashMap) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair) Paragraph(edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)

Example 17 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class ACE_UN_Reader method parse.

/**
 * Parses a tagged document made of {@code <POST>} elements into labelled paragraphs and
 * document-level metadata.
 *
 * <p>Metadata comes from the {@code <DOCID>}, {@code <DATETIME>} and {@code <HEADLINE>}
 * elements (last occurrence wins, empty string when absent). For every {@code <POST>} the
 * following paragraphs are emitted, in this order: {@code "postSubject"}, {@code "poster"},
 * {@code "postDate"}, {@code "postQuote"} (the value of each {@code PREVIOUSPOST} attribute)
 * and finally the free {@code "text"} segments found around the quotes, or the whole remainder
 * after {@code </SUBJECT>} when the post contains no quote.
 *
 * <p>Offsets are computed with {@code indexOf}, i.e. they refer to the first occurrence of the
 * respective string inside the post / the document.
 *
 * @param content the document text including its markup
 * @param contentRemovingTags the same document with the markup removed; used to compute each
 *        paragraph's {@code offsetFilterTags}
 * @return a pair of (labelled paragraphs, metadata map)
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    final String quoteOpen = "<QUOTE PREVIOUSPOST=";
    final String subjectClose = "</SUBJECT>";
    final String postClose = "</POST>";

    final Pattern subjectTag = Pattern.compile("<SUBJECT>(.*?)</SUBJECT>");
    final Pattern posterTag = Pattern.compile("<POSTER>(.*?)</POSTER>");
    final Pattern postDateTag = Pattern.compile("<POSTDATE>(.*?)</POSTDATE>");
    final Pattern quoteTag = Pattern.compile("<QUOTE PREVIOUSPOST=\"(.*?)\"/>");
    final Pattern subjectToQuote = Pattern.compile("</SUBJECT>(.*?)<QUOTE PREVIOUSPOST=");
    final Pattern quoteToPostEnd = Pattern.compile("\"/>(.*?)</POST>");
    final Pattern quoteToQuote = Pattern.compile("\"/>(.*?)<QUOTE PREVIOUSPOST=");
    final Pattern subjectToPostEnd = Pattern.compile("</SUBJECT>(.*?)</POST>");

    final List<Pair<String, Paragraph>> collected = new ArrayList<>();
    final Map<String, String> meta = new HashMap<>();

    meta.put(DocumentMetadata.DocumentID, lastTrimmedGroup("<DOCID>(.*?)</DOCID>", content));
    meta.put(DocumentMetadata.DocumentCreationTime, lastTrimmedGroup("<DATETIME>(.*?)</DATETIME>", content));
    meta.put(DocumentMetadata.HeadLine, lastTrimmedGroup("<HEADLINE>(.*?)</HEADLINE>", content));

    for (Matcher postMatcher = Pattern.compile("<POST>(.*?)</POST>").matcher(content); postMatcher.find(); ) {
        final String post = postMatcher.group(1).trim();
        final int postOffset = content.indexOf(post);

        // Structured fields of the post, one paragraph per occurrence.
        for (Matcher m = subjectTag.matcher(post); m.find(); ) {
            collected.add(toLabelledParagraph("postSubject", m.group(1).trim(), post, postOffset));
        }
        for (Matcher m = posterTag.matcher(post); m.find(); ) {
            collected.add(toLabelledParagraph("poster", m.group(1).trim(), post, postOffset));
        }
        for (Matcher m = postDateTag.matcher(post); m.find(); ) {
            collected.add(toLabelledParagraph("postDate", m.group(1).trim(), post, postOffset));
        }
        for (Matcher m = quoteTag.matcher(post); m.find(); ) {
            collected.add(toLabelledParagraph("postQuote", m.group(1).trim(), post, postOffset));
        }

        if (!post.contains(quoteOpen)) {
            // No quotes: everything after the subject is the body, even if it is empty.
            for (Matcher m = subjectToPostEnd.matcher(post + postClose); m.find(); ) {
                collected.add(toLabelledParagraph("text", m.group(1).trim(), post, postOffset));
            }
            continue;
        }

        // Text between the subject and the first quote.
        for (Matcher m = subjectToQuote.matcher(post); m.find(); ) {
            String body = m.group(1).trim();
            if (!body.isEmpty()) {
                int cut = body.indexOf(subjectClose);
                if (cut >= 0) {
                    // Keep only what follows the first embedded closing subject tag.
                    body = body.substring(cut + subjectClose.length()).trim();
                }
                collected.add(toLabelledParagraph("text", body, post, postOffset));
            }
        }

        // Text between a quote and the end of the post (segments still holding a quote are skipped).
        for (Matcher m = quoteToPostEnd.matcher(post + postClose); m.find(); ) {
            String body = m.group(1).trim();
            if (!body.isEmpty() && !body.contains(quoteOpen)) {
                collected.add(toLabelledParagraph("text", body, post, postOffset));
            }
        }

        // Text between two consecutive quotes.
        for (Matcher m = quoteToQuote.matcher(post); m.find(); ) {
            String body = m.group(1).trim();
            if (!body.isEmpty()) {
                collected.add(toLabelledParagraph("text", body, post, postOffset));
            }
        }
    }

    // Locate each paragraph in the tag-free text. Paragraphs that cannot be found keep their
    // current offsetFilterTags; the search start only moves forward after a "poster" paragraph.
    int searchFrom = 0;
    for (Pair<String, Paragraph> entry : collected) {
        Paragraph paragraph = entry.getSecond();
        int filteredOffset = contentRemovingTags.indexOf(paragraph.content, searchFrom);
        if (filteredOffset != -1) {
            paragraph.offsetFilterTags = filteredOffset;
            if (entry.getFirst().equals("poster")) {
                searchFrom += paragraph.content.length();
            }
        }
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : collected) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            if (paragraph.offsetFilterTags != -1) {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            } else {
                logger.info("[No match phrase in filtered content.]");
            }
            logger.info("\n");
        }
    }
    return new Pair<>(collected, meta);
}

/** Returns the trimmed first capture group of the last match of {@code regex} in {@code input}, or "" if there is no match. */
private static String lastTrimmedGroup(String regex, String input) {
    String value = "";
    for (Matcher m = Pattern.compile(regex).matcher(input); m.find(); ) {
        value = m.group(1).trim();
    }
    return value;
}

/** Wraps {@code body} in a paragraph positioned at its first occurrence in {@code post}, shifted by {@code postOffset}, and tags it with {@code label}. */
private static Pair<String, Paragraph> toLabelledParagraph(String label, String body, String post, int postOffset) {
    int offset = post.indexOf(body) + postOffset;
    Paragraph paragraph = new Paragraph(offset, body);
    return new Pair<String, Paragraph>(label, paragraph);
}
Also used : Pattern(java.util.regex.Pattern) HashMap(java.util.HashMap) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair) Paragraph(edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)

Example 18 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class DepAnnotator method addView.

/**
 * Runs the dependency parser on {@code ta} and attaches the result as the
 * {@code ViewNames.DEPENDENCY} view.
 *
 * @param ta the text annotation to parse; it must already contain every view listed in
 *        {@code requiredViews}
 * @throws AnnotatorException if a required view is missing or the structure inference fails
 */
@Override
public void addView(TextAnnotation ta) throws AnnotatorException {
    for (String reqView : requiredViews) {
        if (!ta.hasView(reqView)) {
            throw new AnnotatorException("TextAnnotation must have view: " + reqView);
        }
    }

    final DepInst instance = new DepInst(ta);
    final DepStruct parsed;
    try {
        parsed = (DepStruct) model.infSolver.getBestStructure(model.wv, instance);
    } catch (Exception e) {
        throw new AnnotatorException("Sentence cannot be parsed");
    }

    final int rootIndex = findRoot(parsed);
    // Positions are shifted down by one because the instance carries an extra <root> node.
    Pair<String, Integer> rootLabel = new Pair<>(instance.forms[rootIndex], rootIndex - 1);
    Tree<Pair<String, Integer>> dependencyTree = new Tree<>(rootLabel);
    populateChildren(dependencyTree, parsed, instance, rootIndex);

    TreeView dependencyView = new TreeView(ViewNames.DEPENDENCY, ta);
    dependencyView.setDependencyTree(0, dependencyTree);
    ta.addView(ViewNames.DEPENDENCY, dependencyView);
}
Also used : AnnotatorException(edu.illinois.cs.cogcomp.annotation.AnnotatorException) TreeView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TreeView) Tree(edu.illinois.cs.cogcomp.core.datastructures.trees.Tree) DepInst(edu.illinois.cs.cogcomp.depparse.core.DepInst) DepStruct(edu.illinois.cs.cogcomp.depparse.core.DepStruct) URISyntaxException(java.net.URISyntaxException) IOException(java.io.IOException) AnnotatorException(edu.illinois.cs.cogcomp.annotation.AnnotatorException) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 19 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class FeatureUtilities method convert.

/**
 * Turns a feature set into two parallel arrays: the feature ids, sorted ascending, and the
 * value accumulated for each id. Ids are obtained via
 * {@link FeatureUtilities#getFeatureId}; features for which that lookup yields a negative id
 * are ignored, and the values of features sharing an id are summed.
 *
 * @param features The feature set
 * @param lexicon The lexicon used for the id lookup
 * @param trainingMode Forwarded to the id lookup; it is meant to control whether an unseen
 *        feature string is added to the lexicon
 * @return a pair of int[] and double[], representing the feature ids and values.
 */
public static Pair<int[], double[]> convert(Set<Feature> features, Lexicon lexicon, boolean trainingMode) {
    final TIntDoubleHashMap valueById = new TIntDoubleHashMap(features.size());
    for (Feature feature : features) {
        final int id = FeatureUtilities.getFeatureId(lexicon, trainingMode, feature);
        if (id >= 0) {
            // Accumulate on top of whatever the map currently reports for this id.
            valueById.put(id, feature.getValue() + valueById.get(id));
        }
    }

    // Work on a private, sorted copy of the key array.
    final int[] sortedIds = valueById.keys().clone();
    Arrays.sort(sortedIds);

    final double[] values = new double[sortedIds.length];
    for (int i = 0; i < sortedIds.length; i++) {
        values[i] = valueById.get(sortedIds[i]);
    }
    return new Pair<>(sortedIds, values);
}
Also used : TIntDoubleHashMap(gnu.trove.map.hash.TIntDoubleHashMap) RealPrimitiveFeature(edu.illinois.cs.cogcomp.lbjava.classify.RealPrimitiveFeature) DiscretePrimitiveFeature(edu.illinois.cs.cogcomp.lbjava.classify.DiscretePrimitiveFeature) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 20 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class ParseHelper method getPhraseFromHead.

/**
 * Expands the head word of an argument into a full constituent. Primarily a fix for prepSRL
 * objects: for "the man with the telescope" the object of the preposition becomes
 * "the telescope" rather than just "telescope".
 *
 * <p>The result is {@code null} when the argument head is not among the leaves of the
 * predicate's parent node, or when climbing from the argument reaches the top of the tree.
 *
 * @param predicate The predicate of the construction (e.g. "with")
 * @param argHead The head-word of the argument of the construction (e.g. "telescope")
 * @param parseViewName The name of the parse view used to extract the phrase-structure tree
 * @return The full constituent phrase containing the argument head, or {@code null}
 */
public static Constituent getPhraseFromHead(Constituent predicate, Constituent argHead, String parseViewName) {
    final TextAnnotation ta = argHead.getTextAnnotation();
    final int sentenceOffset = ta.getSentence(ta.getSentenceId(argHead)).getStartSpan();
    final int argIndex = argHead.getStartSpan() - sentenceOffset;

    // The argument only qualifies if the predicate node "m-commands" it, i.e. the argument
    // token is one of the leaves below the predicate's parent.
    Tree<Pair<String, IntPair>> predicateParent = getTokenIndexedTreeCovering(predicate, parseViewName).getParent();
    boolean commanded = false;
    for (Tree<Pair<String, IntPair>> leaf : predicateParent.getYield()) {
        commanded |= (leaf.getLabel().getSecond().getFirst() == argIndex);
    }
    if (!commanded) {
        return null;
    }

    // Climb from the argument until the parent node is the one holding the preposition.
    final int predicateIndex = predicate.getStartSpan() - sentenceOffset;
    Tree<Pair<String, IntPair>> phrase = getTokenIndexedTreeCovering(argHead, parseViewName);
    while (true) {
        Tree<Pair<String, IntPair>> parent = phrase.getParent();
        if (checkForPredicate(parent, predicateIndex) || parent == null) {
            break;
        }
        phrase = parent;
    }

    // Reaching the top means the phrase is the whole sentence, so the annotation is wrong.
    if (phrase.getParent() == null) {
        return null;
    }

    final int start = predicate.getStartSpan() + 1;
    final int end = start + phrase.getYield().size();
    return new Constituent(argHead.getLabel(), argHead.getViewName(), argHead.getTextAnnotation(), start, end);
}
Also used : TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Aggregations

Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)59 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)35 ArrayList (java.util.ArrayList)17 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)10 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)10 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)7 Matcher (java.util.regex.Matcher)7 Paragraph (edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)6 HashMap (java.util.HashMap)6 Pattern (java.util.regex.Pattern)6 TreeView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TreeView)3 SenseInstance (edu.illinois.cs.cogcomp.verbsense.jlis.SenseInstance)3 SenseStructure (edu.illinois.cs.cogcomp.verbsense.jlis.SenseStructure)3 JsonObject (com.google.gson.JsonObject)2 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)2 ITransformer (edu.illinois.cs.cogcomp.core.transformers.ITransformer)2 IndexedWord (edu.stanford.nlp.ling.IndexedWord)2 Annotation (edu.stanford.nlp.pipeline.Annotation)2 CoreMap (edu.stanford.nlp.util.CoreMap)2 LinkedHashSet (java.util.LinkedHashSet)2