Search in sources :

Example 6 with Paragraph

use of edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph in project cogcomp-nlp by CogComp.

The parse method of the class ACE_WL_Reader.

/**
 * Parses the raw text of an ACE weblog document into its post paragraphs and metadata.
 *
 * <p>Metadata is taken from the {@code <DOCID>}, {@code <DATETIME>} and {@code <HEADLINE>}
 * elements; when an element occurs more than once the last occurrence wins, and when it is
 * absent the empty string is stored. Each {@code <POST>} element yields one paragraph labelled
 * {@code "text"}, holding the post body that follows the closing {@code </POSTDATE>} tag (or the
 * whole post body when there is no such tag).
 *
 * @param content the document text including its markup
 * @param contentRemovingTags the same document with markup removed; used to compute
 *        {@code offsetFilterTags} for every paragraph ({@code -1} when the paragraph text
 *        cannot be found in it verbatim)
 * @return a pair of (labelled paragraphs in document order, metadata map)
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    // The document id gets its own key; it used to be written under DocumentCreationTime
    // and was then overwritten by the date/time value.
    metadata.put(DocumentMetadata.DocumentId, lastTagValue(content, "DOCID"));
    metadata.put(DocumentMetadata.DocumentCreationTime, lastTagValue(content, "DATETIME"));
    metadata.put(DocumentMetadata.HeadLine, lastTagValue(content, "HEADLINE"));

    final String postDateEnd = "</POSTDATE>";
    Matcher postMatcher = Pattern.compile("<POST>(.*?)</POST>").matcher(content);
    while (postMatcher.find()) {
        String text = postMatcher.group(1).trim();
        // Drop the poster/date header only when it is really there; an unconditional
        // substring would cut the first characters of the post (or throw) otherwise.
        int postDateEndAt = text.indexOf(postDateEnd);
        if (postDateEndAt >= 0) {
            text = text.substring(postDateEndAt + postDateEnd.length()).trim();
        }
        // Search from the start of this post so that text repeated earlier in the
        // document is not mistaken for this paragraph.
        int offset = content.indexOf(text, postMatcher.start(1));
        paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(offset, text)));
    }

    // Locate every paragraph in the tag-free text, scanning forward from the end of the
    // previous match so identical paragraphs map to distinct positions.
    int searchFrom = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        int filteredOffset = contentRemovingTags.indexOf(paragraph.content, searchFrom);
        paragraph.offsetFilterTags = filteredOffset;
        if (filteredOffset >= 0) {
            searchFrom = filteredOffset + paragraph.content.length();
        }
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            // Skip the filtered echo when the paragraph was not found in the tag-free text.
            if (paragraph.offsetFilterTags >= 0) {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}

/**
 * Returns the trimmed body of the last {@code <tag>...</tag>} element found in {@code content},
 * or the empty string when there is none.
 */
private static String lastTagValue(String content, String tag) {
    Matcher matcher = Pattern.compile("<" + tag + ">(.*?)</" + tag + ">").matcher(content);
    String value = "";
    while (matcher.find()) {
        value = matcher.group(1).trim();
    }
    return value;
}
Also used : Pattern(java.util.regex.Pattern) HashMap(java.util.HashMap) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair) Paragraph(edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)

Aggregations

Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair): 6, Paragraph (edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph): 6, ArrayList (java.util.ArrayList): 6, HashMap (java.util.HashMap): 6, Matcher (java.util.regex.Matcher): 6, Pattern (java.util.regex.Pattern): 6