Search in sources :

Example 1 with Paragraph

use of edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph in project cogcomp-nlp by CogComp.

the class ACE_BN_Reader method parse.

/**
 * Splits an ACE broadcast-news document into text paragraphs and collects its metadata.
 *
 * <p>Metadata: the document id is read from {@code <DOCNO>} (2004 layout) or {@code <DOCID>},
 * the creation time from {@code <DATE_TIME>} (2004 layout) or {@code <DATETIME>}. If a tag
 * occurs several times the last occurrence wins; if it is absent the empty string is stored.
 *
 * <p>Paragraphs: in the 2004 layout every stretch delimited by {@code <TEXT>}, {@code <TURN>}
 * and {@code </TEXT>} becomes a paragraph; otherwise every {@code <TURN>...</TURN>} body does.
 * Each paragraph is labelled {@code "text"}, holds the trimmed body, and records the position
 * of that body inside {@code content}.
 *
 * <p>NOTE(review): the patterns are compiled without {@code DOTALL}, so a match cannot span a
 * line terminator -- presumably callers pass content with line breaks already removed; confirm.
 *
 * @param content raw document text including markup
 * @param contentRemovingTags text in which each paragraph is looked up to fill
 *        {@code offsetFilterTags} (presumably {@code content} with markup stripped -- confirm
 *        against the caller)
 * @param is2004 selects the 2004 tag names and turn layout
 * @return the paragraphs in document order paired with the metadata map
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags, boolean is2004) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    // Document id: the loop keeps the last occurrence of the tag.
    String docID = "";
    Matcher matcher = Pattern.compile(is2004 ? "<DOCNO>(.*?)</DOCNO>" : "<DOCID>(.*?)</DOCID>").matcher(content);
    while (matcher.find()) {
        docID = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentID, docID);

    // Creation time: same last-occurrence-wins rule.
    String dateTime = "";
    matcher = Pattern.compile(is2004 ? "<DATE_TIME>(.*?)</DATE_TIME>" : "<DATETIME>(.*?)</DATETIME>").matcher(content);
    while (matcher.find()) {
        dateTime = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);

    Pattern turnPattern = is2004
            ? Pattern.compile("<TEXT>(.*?)<TURN>|<TURN>(.*?)<TURN>|<TURN>(.*?)</TEXT>|<TEXT>(.*?)</TEXT>")
            : Pattern.compile("<TURN>(.*?)</TURN>");
    matcher = turnPattern.matcher(content);
    int regionStart = 0;
    while (matcher.find(regionStart)) {
        // Exactly one alternative matched; locate the group that took part in the match.
        int group = 1;
        while (group < matcher.groupCount() && matcher.group(group) == null) {
            ++group;
        }
        String raw = matcher.group(group);
        if (raw == null) {
            // Cannot happen with the patterns above; kept so a pattern change fails softly.
            raw = "";
        }
        String text = raw.trim();
        // Take the offset from the match itself. Searching content for the text would return
        // the first occurrence anywhere in the document, which is wrong for repeated turns.
        // raw.indexOf(text) is the number of leading characters removed by trim().
        int offset = matcher.start(group) + raw.indexOf(text);
        paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(offset, text)));
        if (is2004) {
            // "<TURN>" is 6 characters long: step back so the tag that closed this segment
            // can open the next one.
            regionStart = matcher.end() - 6;
        } else {
            regionStart = matcher.end();
        }
    }

    // Locate every paragraph in the filtered text. The search cursor advances by the summed
    // paragraph lengths, which is a lower bound on the position of the next paragraph.
    int index = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        paragraph.offsetFilterTags = contentRemovingTags.indexOf(paragraph.content, index);
        index += paragraph.content.length();
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            if (paragraph.offsetFilterTags == -1) {
                // Paragraph was not found in the filtered text; substring(-1, ...) would throw.
                logger.info("[No match phrase in filtered content.]");
            } else {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}
Also used : Pattern(java.util.regex.Pattern) HashMap(java.util.HashMap) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair) Paragraph(edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)

Example 2 with Paragraph

use of edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph in project cogcomp-nlp by CogComp.

the class ACE_NW_Reader method parse.

/**
 * Splits an ACE newswire document into text paragraphs and collects its metadata.
 *
 * <p>Metadata: document id from {@code <DOCID>}, creation time from {@code <DATETIME>} and
 * headline from {@code <HEADLINE>}. If a tag occurs several times the last occurrence wins;
 * if it is absent the empty string is stored.
 *
 * <p>Paragraphs: every {@code <TEXT>...</TEXT>} body becomes one paragraph labelled
 * {@code "text"}, holding the trimmed body and the position of that body inside
 * {@code content}.
 *
 * <p>NOTE(review): the patterns are compiled without {@code DOTALL}, so a match cannot span a
 * line terminator -- presumably callers pass content with line breaks already removed; confirm.
 *
 * @param content raw document text including markup
 * @param contentRemovingTags text in which each paragraph is looked up to fill
 *        {@code offsetFilterTags} (presumably {@code content} with markup stripped -- confirm
 *        against the caller)
 * @return the paragraphs in document order paired with the metadata map
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    // For each metadata tag the loop keeps the last occurrence.
    String docID = "";
    Matcher matcher = Pattern.compile("<DOCID>(.*?)</DOCID>").matcher(content);
    while (matcher.find()) {
        docID = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentID, docID);

    String dateTime = "";
    matcher = Pattern.compile("<DATETIME>(.*?)</DATETIME>").matcher(content);
    while (matcher.find()) {
        dateTime = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);

    String headLine = "";
    matcher = Pattern.compile("<HEADLINE>(.*?)</HEADLINE>").matcher(content);
    while (matcher.find()) {
        headLine = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.HeadLine, headLine);

    matcher = Pattern.compile("<TEXT>(.*?)</TEXT>").matcher(content);
    while (matcher.find()) {
        String raw = matcher.group(1);
        String text = raw.trim();
        // Take the offset from the match itself. Searching content for the text would return
        // the first occurrence anywhere in the document, which is wrong for repeated bodies.
        // raw.indexOf(text) is the number of leading characters removed by trim().
        int offset = matcher.start(1) + raw.indexOf(text);
        paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(offset, text)));
    }

    // Locate every paragraph in the filtered text. The search cursor advances by the summed
    // paragraph lengths, which is a lower bound on the position of the next paragraph.
    int index = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        paragraph.offsetFilterTags = contentRemovingTags.indexOf(paragraph.content, index);
        index += paragraph.content.length();
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            if (paragraph.offsetFilterTags == -1) {
                // Paragraph was not found in the filtered text; substring(-1, ...) would throw.
                logger.info("[No match phrase in filtered content.]");
            } else {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}
Also used : Pattern(java.util.regex.Pattern) HashMap(java.util.HashMap) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair) Paragraph(edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)

Example 3 with Paragraph

use of edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph in project cogcomp-nlp by CogComp.

the class ACE_UN_Reader method parse.

/**
 * Splits an ACE usenet/forum document into labelled paragraphs and collects its metadata.
 *
 * <p>Metadata: document id from {@code <DOCID>}, creation time from {@code <DATETIME>} and
 * headline from {@code <HEADLINE>}. If a tag occurs several times the last occurrence wins;
 * if it is absent the empty string is stored.
 *
 * <p>Paragraphs: for every {@code <POST>...</POST>} body the method appends, in this order,
 * the subjects ({@code "postSubject"}), posters ({@code "poster"}), post dates
 * ({@code "postDate"}), quoted-post attribute values ({@code "postQuote"}) and finally the
 * free text around the quotes ({@code "text"}). The list is therefore grouped by kind within
 * a post, not sorted by position.
 *
 * <p>NOTE(review): every offset is derived with {@code indexOf}, i.e. from the FIRST occurrence
 * of the string, so repeated strings share one offset -- confirm whether callers rely on this.
 *
 * @param content raw document text including markup
 * @param contentRemovingTags text in which each paragraph is looked up to fill
 *        {@code offsetFilterTags} (presumably {@code content} with markup stripped -- confirm
 *        against the caller)
 * @return the paragraphs paired with the metadata map
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();
    Pattern pattern = null;
    Matcher matcher = null;
    String docID = "";
    String dateTime = "";
    String headLine = "";
    String text = "";
    // Metadata tags: each loop overwrites its variable, so the last occurrence wins.
    pattern = Pattern.compile("<DOCID>(.*?)</DOCID>");
    matcher = pattern.matcher(content);
    while (matcher.find()) {
        docID = (matcher.group(1)).trim();
    }
    metadata.put(DocumentMetadata.DocumentID, docID);
    pattern = Pattern.compile("<DATETIME>(.*?)</DATETIME>");
    matcher = pattern.matcher(content);
    while (matcher.find()) {
        dateTime = (matcher.group(1)).trim();
    }
    metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);
    pattern = Pattern.compile("<HEADLINE>(.*?)</HEADLINE>");
    matcher = pattern.matcher(content);
    while (matcher.find()) {
        headLine = (matcher.group(1)).trim();
    }
    metadata.put(DocumentMetadata.HeadLine, headLine);
    pattern = Pattern.compile("<POST>(.*?)</POST>");
    matcher = pattern.matcher(content);
    while (matcher.find()) {
        // text is the trimmed post body; index4 is where that body FIRST occurs in content
        // and serves as the base for every offset computed inside this post.
        text = (matcher.group(1)).trim();
        int index4 = content.indexOf(text);
        // Subject line(s) of the post.
        Pattern patternQuote = Pattern.compile("<SUBJECT>(.*?)</SUBJECT>");
        Matcher matcherQuote = patternQuote.matcher(text);
        while (matcherQuote.find()) {
            String subject = (matcherQuote.group(1)).trim();
            int indexsubject = text.indexOf(subject) + index4;
            Paragraph paraSub = new Paragraph(indexsubject, subject);
            Pair<String, Paragraph> pair = new Pair<String, Paragraph>("postSubject", paraSub);
            paragraphs.add(pair);
        }
        // Poster name(s).
        patternQuote = Pattern.compile("<POSTER>(.*?)</POSTER>");
        matcherQuote = patternQuote.matcher(text);
        while (matcherQuote.find()) {
            String quote = (matcherQuote.group(1)).trim();
            int indexQuote = text.indexOf(quote) + index4;
            Paragraph paraSub = new Paragraph(indexQuote, quote);
            Pair<String, Paragraph> pair = new Pair<String, Paragraph>("poster", paraSub);
            paragraphs.add(pair);
        }
        // Post date(s).
        patternQuote = Pattern.compile("<POSTDATE>(.*?)</POSTDATE>");
        matcherQuote = patternQuote.matcher(text);
        while (matcherQuote.find()) {
            String quote = (matcherQuote.group(1)).trim();
            int indexQuote = text.indexOf(quote) + index4;
            Paragraph paraSub = new Paragraph(indexQuote, quote);
            Pair<String, Paragraph> pair = new Pair<String, Paragraph>("postDate", paraSub);
            paragraphs.add(pair);
        }
        // Quoted earlier posts: the paragraph is the value of the PREVIOUSPOST attribute.
        patternQuote = Pattern.compile("<QUOTE PREVIOUSPOST=\"(.*?)\"/>");
        matcherQuote = patternQuote.matcher(text);
        while (matcherQuote.find()) {
            String quote = (matcherQuote.group(1)).trim();
            int indexQuote = text.indexOf(quote) + index4;
            Paragraph paraSub = new Paragraph(indexQuote, quote);
            Pair<String, Paragraph> pair = new Pair<String, Paragraph>("postQuote", paraSub);
            paragraphs.add(pair);
        }
        if (text.contains("<QUOTE PREVIOUSPOST=")) {
            // The post quotes earlier posts: collect the free text around the quote tags in
            // three separate passes (before the first quote, after a quote up to the end of
            // the post, and between two quotes).
            // Pass 1: text between </SUBJECT> and the next quote tag.
            patternQuote = Pattern.compile("</SUBJECT>(.*?)<QUOTE PREVIOUSPOST=");
            matcherQuote = patternQuote.matcher(text);
            while (matcherQuote.find()) {
                String newText = (matcherQuote.group(1)).trim();
                if (newText.equals("")) {
                    continue;
                }
                // If the captured span itself contains a </SUBJECT>, keep only what follows
                // its first occurrence.
                if (newText.contains("</SUBJECT>"))
                    newText = newText.substring(newText.indexOf("</SUBJECT>") + "</SUBJECT>".length()).trim();
                int indexNewText = text.indexOf(newText) + index4;
                Paragraph paraNewText = new Paragraph(indexNewText, newText);
                Pair<String, Paragraph> pair = new Pair<String, Paragraph>("text", paraNewText);
                paragraphs.add(pair);
            }
            // Pass 2: text from the end of a quote tag up to the end of the post. "</POST>"
            // is appended as an end marker because the outer match excluded it; spans that
            // still contain another quote tag are skipped here.
            patternQuote = Pattern.compile("\"/>(.*?)</POST>");
            matcherQuote = patternQuote.matcher(text + "</POST>");
            while (matcherQuote.find()) {
                String newText = (matcherQuote.group(1)).trim();
                if (newText.equals("") || newText.contains("<QUOTE PREVIOUSPOST=")) {
                    continue;
                }
                int indexNewText = text.indexOf(newText) + index4;
                Paragraph paraNewText = new Paragraph(indexNewText, newText);
                Pair<String, Paragraph> pair = new Pair<String, Paragraph>("text", paraNewText);
                paragraphs.add(pair);
            }
            // Pass 3: text between the end of one quote tag and the start of the next.
            patternQuote = Pattern.compile("\"/>(.*?)<QUOTE PREVIOUSPOST=");
            matcherQuote = patternQuote.matcher(text);
            while (matcherQuote.find()) {
                String newText = (matcherQuote.group(1)).trim();
                if (newText.equals("")) {
                    continue;
                }
                int indexNewText = text.indexOf(newText) + index4;
                Paragraph paraNewText = new Paragraph(indexNewText, newText);
                Pair<String, Paragraph> pair = new Pair<String, Paragraph>("text", paraNewText);
                paragraphs.add(pair);
            }
        } else {
            // No quotes: everything after </SUBJECT> up to the end of the post is the text.
            // Unlike the quote branch, an empty result is not skipped here.
            patternQuote = Pattern.compile("</SUBJECT>(.*?)</POST>");
            matcherQuote = patternQuote.matcher(text + "</POST>");
            while (matcherQuote.find()) {
                String newText = (matcherQuote.group(1)).trim();
                int indexNewText = text.indexOf(newText) + index4;
                Paragraph paraNewText = new Paragraph(indexNewText, newText);
                Pair<String, Paragraph> pair = new Pair<String, Paragraph>("text", paraNewText);
                paragraphs.add(pair);
            }
        }
    }
    // Locate every paragraph in the filtered text, searching from index onwards.
    int index = 0;
    for (int i = 0; i < paragraphs.size(); ++i) {
        int offsetWithFiltering = contentRemovingTags.indexOf(paragraphs.get(i).getSecond().content, index);
        if (offsetWithFiltering == -1) {
            // Not found: offsetFilterTags keeps whatever value the Paragraph was created with
            // (presumably -1, since the debug dump below tests for it -- confirm).
            continue;
        }
        paragraphs.get(i).getSecond().offsetFilterTags = offsetWithFiltering;
        // The search cursor only moves forward after a "poster" paragraph.
        // NOTE(review): the reason is not visible here -- possibly because paragraphs are not
        // appended in document order; confirm before changing.
        if (paragraphs.get(i).getFirst().equals("poster"))
            index += paragraphs.get(i).getSecond().content.length();
    }
    if (isDebug) {
        // For each paragraph log its label and content, the same span cut from the raw text,
        // and the span cut from the filtered text when it was found there.
        for (int i = 0; i < paragraphs.size(); ++i) {
            logger.info(paragraphs.get(i).getFirst() + "--> " + paragraphs.get(i).getSecond().content);
            logger.info(content.substring(paragraphs.get(i).getSecond().offset, paragraphs.get(i).getSecond().offset + paragraphs.get(i).getSecond().content.length()));
            if (paragraphs.get(i).getSecond().offsetFilterTags == -1) {
                logger.info("[No match phrase in filtered content.]");
            } else {
                logger.info(contentRemovingTags.substring(paragraphs.get(i).getSecond().offsetFilterTags, paragraphs.get(i).getSecond().offsetFilterTags + paragraphs.get(i).getSecond().content.length()));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}
Also used : Pattern(java.util.regex.Pattern) HashMap(java.util.HashMap) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair) Paragraph(edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)

Example 4 with Paragraph

use of edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph in project cogcomp-nlp by CogComp.

the class ACE_BC_Reader method parse.

/**
 * Splits an ACE broadcast-conversation document into speaker turns and collects its metadata.
 *
 * <p>Metadata: document id from {@code <DOCID>} and creation time from {@code <DATETIME>}.
 * If a tag occurs several times the last occurrence wins; if it is absent the empty string is
 * stored.
 *
 * <p>Paragraphs: every {@code <TURN>...</TURN>} body becomes one paragraph labelled
 * {@code "text"}. Everything up to and including the first {@code </SPEAKER>} is dropped when
 * that tag is present; a turn without it is kept whole. The paragraph holds the trimmed
 * remainder and the position of that remainder inside {@code content}.
 *
 * <p>NOTE(review): the patterns are compiled without {@code DOTALL}, so a match cannot span a
 * line terminator -- presumably callers pass content with line breaks already removed; confirm.
 *
 * @param content raw document text including markup
 * @param contentRemovingTags text in which each paragraph is looked up to fill
 *        {@code offsetFilterTags} (presumably {@code content} with markup stripped -- confirm
 *        against the caller)
 * @return the paragraphs in document order paired with the metadata map
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    final String speakerClose = "</SPEAKER>";
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    // For each metadata tag the loop keeps the last occurrence.
    String docID = "";
    Matcher matcher = Pattern.compile("<DOCID>(.*?)</DOCID>").matcher(content);
    while (matcher.find()) {
        docID = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentID, docID);

    String dateTime = "";
    matcher = Pattern.compile("<DATETIME>(.*?)</DATETIME>").matcher(content);
    while (matcher.find()) {
        dateTime = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);

    // NOTE(review): the original scanned <HEADLINE> here but never stored the value; the scan
    // had no observable effect and is omitted. Add metadata.put(...) if the headline is wanted.

    matcher = Pattern.compile("<TURN>(.*?)</TURN>").matcher(content);
    while (matcher.find()) {
        String raw = matcher.group(1);
        // Skip the speaker prefix only when it exists. Adding the tag length to a -1 result
        // would silently cut the first characters of the turn (or throw on a short one).
        int speakerEnd = raw.indexOf(speakerClose);
        int bodyStart = speakerEnd < 0 ? 0 : speakerEnd + speakerClose.length();
        String body = raw.substring(bodyStart);
        String text = body.trim();
        // Take the offset from the match itself. Searching content for the text would return
        // the first occurrence anywhere in the document, which is wrong for repeated turns.
        // body.indexOf(text) is the number of leading characters removed by trim().
        int offset = matcher.start(1) + bodyStart + body.indexOf(text);
        paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(offset, text)));
    }

    // Locate every paragraph in the filtered text. The search cursor advances by the summed
    // paragraph lengths, which is a lower bound on the position of the next paragraph.
    int index = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        paragraph.offsetFilterTags = contentRemovingTags.indexOf(paragraph.content, index);
        index += paragraph.content.length();
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            if (paragraph.offsetFilterTags == -1) {
                // Paragraph was not found in the filtered text; substring(-1, ...) would throw.
                logger.info("[No match phrase in filtered content.]");
            } else {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}
Also used : Pattern(java.util.regex.Pattern) HashMap(java.util.HashMap) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair) Paragraph(edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)

Example 5 with Paragraph

use of edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph in project cogcomp-nlp by CogComp.

the class ACE_CTS_Reader method parse.

/**
 * Splits an ACE conversational-telephone-speech document into speaker turns and collects its
 * metadata.
 *
 * <p>Metadata: document id from {@code <DOCID>} and creation time from {@code <DATETIME>}.
 * If a tag occurs several times the last occurrence wins; if it is absent the empty string is
 * stored.
 *
 * <p>Paragraphs: every {@code <TURN>...</TURN>} body becomes one paragraph labelled
 * {@code "text"}. Everything up to and including the first {@code </SPEAKER>} is dropped when
 * that tag is present; a turn without it is kept whole. The paragraph holds the trimmed
 * remainder and the position of that remainder inside {@code content}.
 *
 * <p>NOTE(review): the patterns are compiled without {@code DOTALL}, so a match cannot span a
 * line terminator -- presumably callers pass content with line breaks already removed; confirm.
 *
 * @param content raw document text including markup
 * @param contentRemovingTags text in which each paragraph is looked up to fill
 *        {@code offsetFilterTags} (presumably {@code content} with markup stripped -- confirm
 *        against the caller)
 * @return the paragraphs in document order paired with the metadata map
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    final String speakerClose = "</SPEAKER>";
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    // For each metadata tag the loop keeps the last occurrence.
    String docID = "";
    Matcher matcher = Pattern.compile("<DOCID>(.*?)</DOCID>").matcher(content);
    while (matcher.find()) {
        docID = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentID, docID);

    String dateTime = "";
    matcher = Pattern.compile("<DATETIME>(.*?)</DATETIME>").matcher(content);
    while (matcher.find()) {
        dateTime = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);

    matcher = Pattern.compile("<TURN>(.*?)</TURN>").matcher(content);
    while (matcher.find()) {
        String raw = matcher.group(1);
        // Skip the speaker prefix only when it exists. Adding the tag length to a -1 result
        // would silently cut the first characters of the turn (or throw on a short one).
        int speakerEnd = raw.indexOf(speakerClose);
        int bodyStart = speakerEnd < 0 ? 0 : speakerEnd + speakerClose.length();
        String body = raw.substring(bodyStart);
        String text = body.trim();
        // Take the offset from the match itself. Searching content for the text would return
        // the first occurrence anywhere in the document, which is wrong for repeated turns.
        // body.indexOf(text) is the number of leading characters removed by trim().
        int offset = matcher.start(1) + bodyStart + body.indexOf(text);
        paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(offset, text)));
    }

    // Locate every paragraph in the filtered text. The search cursor advances by the summed
    // paragraph lengths, which is a lower bound on the position of the next paragraph.
    int index = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        paragraph.offsetFilterTags = contentRemovingTags.indexOf(paragraph.content, index);
        index += paragraph.content.length();
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            if (paragraph.offsetFilterTags == -1) {
                // Paragraph was not found in the filtered text; substring(-1, ...) would throw.
                logger.info("[No match phrase in filtered content.]");
            } else {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}
Also used : Pattern(java.util.regex.Pattern) HashMap(java.util.HashMap) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair) Paragraph(edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)

Aggregations

Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair): 6, Paragraph (edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph): 6, ArrayList (java.util.ArrayList): 6, HashMap (java.util.HashMap): 6, Matcher (java.util.regex.Matcher): 6, Pattern (java.util.regex.Pattern): 6