Search in sources :

Example 1 with TextAnnotationBuilder

use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class HashCollisionReport method main.

/**
 * Read each regular file in the given directory, tokenize it and build the token view, then
 * check the tokens of that file for hash code collisions. Every collision found is reported on
 * standard error.
 *
 * @param args the first element names the directory holding the files to test against
 * @throws IOException if a file cannot be read, or if the directory contents cannot be listed
 */
public static void main(String[] args) throws IOException {
    if (args.length == 0) {
        error("Must pass in the name of a directory with files to test against.");
    }
    File dir = new File(args[0]);
    if (!dir.exists()) {
        error("The directory did not exist : " + dir);
    }
    if (!dir.isDirectory()) {
        error("The path was not a directory : " + dir);
    }
    // File.listFiles() returns null (not an empty array) when an I/O error occurs or the path
    // is not a directory; fail with a clear message instead of a NullPointerException below.
    File[] files = dir.listFiles();
    if (files == null) {
        throw new IOException("Could not list the contents of directory : " + dir);
    }
    // The builder does not depend on the file being processed, so build it once and reuse it.
    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    for (File file : files) {
        if (file.isFile()) {
            String normal = FileUtils.readFileToString(file);
            TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
            List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
            // Maps each hash code seen so far in this file to the first token that produced it.
            HashMap<Integer, Constituent> hashmap = new HashMap<>();
            // For each token, check whether its hash code is already used; if it is, report it.
            for (Constituent c : normalToks) {
                int code = c.hashCode();
                // Stored values are never null, so a non-null result means the code was seen.
                Constituent dup = hashmap.get(code);
                if (dup != null) {
                    System.err.println(c + " == " + dup);
                } else {
                    hashmap.put(code, c);
                }
            }
        }
    }
}
Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) HashMap(java.util.HashMap) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) File(java.io.File) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 2 with TextAnnotationBuilder

use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testCharacterOffsetToTokenIndex.

/**
 * Test whether the mapping between character offset and token index is correct.
 *
 * <p>The same text is annotated twice: once on its own ("normal"), and once embedded between
 * leading and trailing markup ("other"), where the token character offsets are shifted by the
 * length of the leading markup. Token start offsets must differ by exactly that shift, and
 * character offsets that fall outside any token must map to token id -1.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = "   \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;
    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());
    int ignoreUpToOffset = leadingWaste.length();
    // Size the offsets array from the token list rather than a hard-coded count, so the test
    // does not silently depend on the sample text producing a fixed number of tokens.
    IntPair[] characterOffsets = new IntPair[normalToks.size()];
    String[] tokens = taNormal.getTokens();
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        // Shift every token span past the leading markup.
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(), ignoreUpToOffset + t.getEndCharOffset());
    }
    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }
    // all info should be same except initial char offsets of tokens ignore spans of text
    TextAnnotation taOther = new TextAnnotation("test", "other", other, characterOffsets, tokens, sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();
    // assertEquals takes (expected, actual): the expected value is the shifted "normal" offset.
    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokNormalStart + leadingWaste.length(), thirdTokOtherStart);
    int eighthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int eighthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(eighthTokNormalStart + leadingWaste.length(), eighthTokOtherStart);
    // Offset 2 lies inside the leading markup, before the first shifted token.
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);
    // This offset lies inside the trailing markup, after the text of "normal" ends.
    int meaninglessPastEndOffset = taOther.getTokenIdFromCharacterOffset(leadingWaste.length() + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);
    // Offset 20 in "normal" is the first newline separating the two sentences.
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}
Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) Test(org.junit.Test)

Example 3 with TextAnnotationBuilder

use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class NerInitTest method testInit.

/**
 * Build an NER annotator from a configuration with gazetteer features, random noise and
 * omission all set to zero, run it over {@code TESTSTR}, and check that the NER_CONLL view
 * is added and holds at least one constituent.
 */
@Test
public void testInit() {
    Properties props = new Properties();
    props.setProperty(NerBaseConfigurator.GAZETTEER_FEATURES, "0");
    // props.setProperty(NerBaseConfigurator.BROWN_CLUSTER_PATHS, "0");
    props.setProperty(NerBaseConfigurator.RANDOM_NOISE_LEVEL, "0.0");
    props.setProperty(NerBaseConfigurator.OMISSION_RATE, "0.0");
    ResourceManager rm = (new NerBaseConfigurator()).getConfig(new ResourceManager(props));
    NERAnnotator ner = NerAnnotatorManager.buildNerAnnotator(rm, ViewNames.NER_CONLL);
    assertNotNull(ner);
    TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation ta = tab.createTextAnnotation(TESTSTR);
    try {
        ner.getView(ta);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    // Use a JUnit assertion rather than the Java `assert` keyword: `assert` is skipped unless
    // the JVM runs with -ea, so a missing view could otherwise go undetected.
    assertTrue(ta.hasView(ViewNames.NER_CONLL));
    assertTrue(ta.getView(ViewNames.NER_CONLL).getConstituents().size() >= 1);
}
Also used : NerBaseConfigurator(edu.illinois.cs.cogcomp.ner.config.NerBaseConfigurator) TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) AnnotatorException(edu.illinois.cs.cogcomp.annotation.AnnotatorException) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager) Properties(java.util.Properties) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Test(org.junit.Test)

Example 4 with TextAnnotationBuilder

use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class MascReader method main.

/**
 * Reads the MASC corpus into gold TextAnnotations, re-tokenizes the text of each one to get a
 * predicted TextAnnotation, and writes both out as json files. The gold files go to the given
 * output directory and the predicted ones to a sibling directory with the suffix "_PRED".
 * Token and sentence span agreement between predicted and gold is accumulated and printed at
 * the end, after the reader's own report.
 *
 * @param args exactly two elements: the MASC corpus directory (e.g.
 *        /home/mssammon/work/data/masc-ccg/written/) and the output directory
 */
public static void main(String[] args) {
    if (2 != args.length) {
        System.err.println("Usage: " + NAME + " mascCorpusDir outDir");
        System.exit(-1);
    }
    final String mascDir = args[0];
    final String goldOutDir = args[1];
    final String predOutDir = goldOutDir + "_PRED";

    Properties config = new Properties();
    config.setProperty(CorpusReaderConfigurator.CORPUS_DIRECTORY.key, mascDir);
    config.setProperty(CorpusReaderConfigurator.SOURCE_DIRECTORY.key, mascDir);
    IOUtils.mkdir(goldOutDir);
    IOUtils.mkdir(predOutDir);

    ResourceManager resources = new ResourceManager(config);
    MascReader corpus = null;
    try {
        corpus = new MascReader(resources);
    } catch (Exception ex) {
        ex.printStackTrace();
        System.exit(-1);
    }

    TextAnnotationBuilder predictor = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, false));

    // Running totals over all documents: gold span counts and how many the prediction matched.
    int tokHits = 0;
    int tokCount = 0;
    int sentHits = 0;
    int sentCount = 0;

    while (corpus.hasNext()) {
        TextAnnotation gold = corpus.next();
        String rawText = gold.getText();
        TextAnnotation predicted = predictor.createTextAnnotation(gold.getCorpusId() + "_PREDICTED", gold.getId(), rawText);

        // Token-level comparison.
        IntPair[] goldTokSpans = getCharacterOffsets(gold.getView(ViewNames.TOKENS));
        tokCount += goldTokSpans.length;
        tokHits += countCorrectSpans(predicted.getView(ViewNames.TOKENS), goldTokSpans);

        // Sentence-level comparison.
        IntPair[] goldSentSpans = getCharacterOffsets(gold.getView(ViewNames.SENTENCE));
        sentCount += goldSentSpans.length;
        sentHits += countCorrectSpans(predicted.getView(ViewNames.SENTENCE), goldSentSpans);

        // Write the gold annotation.
        String goldJson = SerializationHelper.serializeToJson(gold, true);
        String goldPath = Paths.get(goldOutDir, gold.getId() + ".json").toString();
        try {
            logger.trace("Writing file out to '{}'...", goldPath);
            LineIO.write(goldPath, Collections.singletonList(goldJson));
        } catch (IOException ex) {
            ex.printStackTrace();
            System.exit(-1);
        }

        // Write the predicted annotation.
        String predPath = Paths.get(predOutDir, predicted.getId() + ".json").toString();
        String predJson = SerializationHelper.serializeToJson(predicted, true);
        try {
            logger.debug("writing file '{}'...", predPath);
            LineIO.write(predPath, Collections.singletonList(predJson));
        } catch (IOException ex) {
            ex.printStackTrace();
            System.exit(-1);
        }
        logger.debug("## finished processing file '{}'.", gold.getId());
    }

    System.out.println(corpus.generateReport());
    System.out.print("TOKEN PERFORMANCE:");
    computeAndPrintAcc(tokHits, tokCount);
    System.out.print("SENTENCE PERFORMANCE:");
    computeAndPrintAcc(sentHits, sentCount);
}
Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) XMLStreamException(javax.xml.stream.XMLStreamException) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)

Example 5 with TextAnnotationBuilder

use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class MultilingualEreReaderTest method testChinese.

/**
 * Creates an {@code EREEventReader} over the ENR3 corpus at {@code chinesePathB}, using the
 * TextAnnotationBuilder obtained for the Chinese language code, and hands it to
 * {@code testReader}. Any exception while building the reader is printed and fails the test.
 */
public static void testChinese() {
    // Ask the reader to throw rather than continue when XML parsing fails.
    final boolean failOnBadXml = true;
    EREEventReader ereReader = null;
    try {
        String languageCode = Language.Chinese.getCode();
        TextAnnotationBuilder zhBuilder = MultiLingualTokenizer.getTokenizer(languageCode);
        ereReader = new EREEventReader(EREDocumentReader.EreCorpus.ENR3, zhBuilder, chinesePathB, failOnBadXml);
    } catch (Exception ex) {
        ex.printStackTrace();
        fail(ex.getMessage());
    }
    testReader(ereReader);
}
Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) EREEventReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREEventReader)

Aggregations

TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder)22 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)20 StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)16 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)15 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)7 POSAnnotator (edu.illinois.cs.cogcomp.pos.POSAnnotator)7 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)6 ResourceManager (edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager)6 Properties (java.util.Properties)6 Test (org.junit.Test)5 ChunkerAnnotator (edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator)3 ChunkerConfigurator (edu.illinois.cs.cogcomp.chunker.main.ChunkerConfigurator)3 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)3 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)3 StanfordDepHandler (edu.illinois.cs.cogcomp.pipeline.handlers.StanfordDepHandler)3 POSTaggerAnnotator (edu.stanford.nlp.pipeline.POSTaggerAnnotator)3 ParserAnnotator (edu.stanford.nlp.pipeline.ParserAnnotator)3 File (java.io.File)3 ArrayList (java.util.ArrayList)3 EREEventReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREEventReader)2