Search in sources :

Example 21 with View

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.

the class MultilingualTokenizeTextToColumn method processFile.

/**
 * Reads a plain-text file as UTF-8, tokenizes it, and writes the tokens to the named output
 * file in a tab-separated column format.
 *
 * <p>Each output line holds: the 1-based index of the token within its sentence, the token's
 * surface form, its start character offset and its end character offset. An empty line is
 * written after each sentence. The output file is written as UTF-8.
 *
 * @param corpus name of corpus
 * @param in file to tokenize
 * @param out output file for tokenized text
 * @throws IOException if {@code in} does not exist or is not a regular file, if reading it
 *         fails, or if the output file cannot be written
 */
public void processFile(String corpus, File in, String out) throws IOException {
    if (!in.exists())
        throw new IOException("File '" + in.getAbsolutePath() + "' doesn't exist.");
    if (!in.isFile())
        throw new IOException("File '" + in.getAbsolutePath() + "' exists but is not a file.");

    StringBuilder text = new StringBuilder();
    // try-with-resources guarantees the underlying stream is released even if reading fails.
    try (Scanner scanner = new Scanner(new FileInputStream(in), StandardCharsets.UTF_8.name())) {
        while (scanner.hasNextLine()) {
            // line terminators are normalized to "\n"
            text.append(scanner.nextLine()).append("\n");
        }
        // Scanner swallows read errors and just reports end of input; surface them so a
        // failed read is not mistaken for a (truncated) complete file.
        IOException readError = scanner.ioException();
        if (readError != null)
            throw readError;
    }
    String str = text.toString();

    TextAnnotation ta = taBldr.createTextAnnotation(corpus, in.getName(), str);
    View sents = ta.getView(ViewNames.SENTENCE);
    logger.info("processing file '{}'; input length is {}", in.getAbsolutePath(), str.length());
    List<Constituent> toks = ta.getView(ViewNames.TOKENS).getConstituents();

    StringBuilder bldr = new StringBuilder();
    for (Constituent sent : sents) {
        // token numbering restarts at 1 for every sentence
        int index = 1;
        for (Constituent tok : toks) {
            // keep only tokens whose character span lies inside this sentence's span
            if (tok.getStartCharOffset() >= sent.getStartCharOffset() && tok.getEndCharOffset() <= sent.getEndCharOffset()) {
                bldr.append(index++).append("\t")
                        .append(tok.getSurfaceForm()).append("\t")
                        .append(tok.getStartCharOffset()).append("\t")
                        .append(tok.getEndCharOffset()).append(System.lineSeparator());
            }
        }
        // empty line to separate sentences
        bldr.append(System.lineSeparator());
    }

    // materialize the output once; it is used for both the length report and the write
    String output = bldr.toString();
    logger.info("output length: {}", output.length());
    try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(out)), StandardCharsets.UTF_8.name())) {
        writer.write(output);
    } catch (IOException e) {
        // trailing throwable argument makes the logger record the stack trace
        logger.error("Can't write to file {}: {}", out, e.getMessage(), e);
        throw e;
    }
}
Also used : Scanner(java.util.Scanner) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 22 with View

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.

the class PathLSTMHandler method addView.

/**
 * Computes the SRL view for the given text annotation via {@code getSRL} and attaches it
 * under the name returned by {@code getViewName()}.
 *
 * @param ta the text annotation to which the view is added
 * @throws AnnotatorException if building or attaching the view fails for any reason; the
 *         original exception is attached as the cause
 */
@Override
public void addView(TextAnnotation ta) throws AnnotatorException {
    try {
        View srlView = getSRL(ta);
        ta.addView(getViewName(), srlView);
    } catch (Exception e) {
        e.printStackTrace();
        // Wrap in the exception type callers expect, but keep the original exception as the
        // cause so its type and stack trace are not lost.
        AnnotatorException wrapped = new AnnotatorException(e.getMessage());
        wrapped.initCause(e);
        throw wrapped;
    }
}
Also used : AnnotatorException(edu.illinois.cs.cogcomp.annotation.AnnotatorException) PredicateArgumentView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) DatastoreException(org.cogcomp.DatastoreException) AnnotatorException(edu.illinois.cs.cogcomp.annotation.AnnotatorException) IOException(java.io.IOException)

Example 23 with View

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.

the class LemmatizerTATest method testCreateTextAnnotationLemmaView.

/**
 * Builds the lemma view for the shared input annotation and checks the labels of the
 * constituents at a few fixed positions against their expected values.
 */
@Test
public void testCreateTextAnnotationLemmaView() {
    View lemmaView = null;
    try {
        lemmaView = lem.createLemmaView(inputTa);
    } catch (IOException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    // the test must not pass silently when no view was produced
    assertTrue(lemmaView != null);

    List<Constituent> spans = lemmaView.getConstituents();
    printConstituents(System.out, spans);

    // labels at the positions under test (indices refer to the constituent list of the view)
    String labelAt0 = spans.get(0).getLabel();
    String labelAt1 = spans.get(1).getLabel();
    String labelAt2 = spans.get(2).getLabel();
    String labelAt6 = spans.get(6).getLabel();
    String labelAt15 = spans.get(15).getLabel();

    // JUnit convention: expected value first, actual value second
    assertEquals("the", labelAt0);
    assertEquals("cia", labelAt1);
    assertEquals("think", labelAt2);
    assertEquals("have", labelAt6);
    assertEquals("be", labelAt15);
}
Also used : IOException(java.io.IOException) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) Test(org.junit.Test)

Example 24 with View

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.

the class NERAnnotatorTest method testTokenization.

/**
 * Tests that tokenization produces the correct number of constituents in the view
 * returned by {@code getView} for two inputs.
 */
@Test
public void testTokenization() {
    TextAnnotation ta = tab.createTextAnnotation(TOKEN_TEST);
    View nerView = null;
    try {
        nerView = getView(ta);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    // JUnit convention: expected value first, actual value second
    assertEquals(2, nerView.getConstituents().size());

    String tokTestB = "Grigory Pasko, crusading Russian journalist who documented Russian Navy's mishandling of " + "nuclear waste, is released on parole after serving two-thirds of his four-year prison sentence.";
    ta = tab.createTextAnnotation(tokTestB);
    try {
        nerView = getView(ta);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    assertEquals(3, nerView.getNumberOfConstituents());
}
Also used : AnnotatorException(edu.illinois.cs.cogcomp.annotation.AnnotatorException) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) Test(org.junit.Test)

Example 25 with View

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.

the class NERAnnotatorTest method evaluatePerformance.

/**
 * Timing check: annotates the test input repeatedly and compares the average time per run
 * with a budget obtained from {@code measureMachinePerformance()}, so the threshold scales
 * with the machine the test runs on. Currently disabled (the test annotation is commented
 * out).
 */
// @Test
public void evaluatePerformance() {
    final int runs = 100;

    // Warm-up call so that any lazy loading happens before the timed section.
    TextAnnotation warmupTa = tab.createTextAnnotation(TEST_INPUT);
    try {
        getView(warmupTa);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }

    long expectedPerformance = this.measureMachinePerformance();
    logger.info("Expect " + expectedPerformance);

    // One untimed run to confirm a view is actually produced.
    TextAnnotation checkTa = tab.createTextAnnotation(TEST_INPUT);
    View checkView = null;
    try {
        checkView = getView(checkTa);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    assertTrue(checkView != null);

    // Timed section.
    long startMillis = System.currentTimeMillis();
    int run = 0;
    while (run < runs) {
        TextAnnotation ta = tab.createTextAnnotation(TEST_INPUT);
        View view = null;
        try {
            view = getView(ta);
        } catch (AnnotatorException e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
        assertTrue(view != null);
        // every constituent in the view must be one of the known entities
        for (Constituent c : view.getConstituents()) {
            String entityText = c.toString();
            assertTrue("No entity named \"" + entityText + "\"", entities.contains(entityText));
        }
        run++;
    }
    long elapsedMillis = System.currentTimeMillis() - startMillis;
    // integer division: average whole milliseconds per run
    long averageMillis = elapsedMillis / runs;

    System.out.printf("For text size = %d, average NER runtime = %d, normalized = %f", TEST_INPUT.length(), averageMillis, (double) averageMillis / (double) expectedPerformance);
    assertTrue(averageMillis <= expectedPerformance);
}
Also used : AnnotatorException(edu.illinois.cs.cogcomp.annotation.AnnotatorException) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Aggregations

View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)64 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)51 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)49 Feature (edu.illinois.cs.cogcomp.edison.features.Feature)22 Test (org.junit.Test)21 FeatureExtractor (edu.illinois.cs.cogcomp.edison.features.FeatureExtractor)16 ProjectedPath (edu.illinois.cs.cogcomp.edison.features.lrec.ProjectedPath)16 FeatureManifest (edu.illinois.cs.cogcomp.edison.features.manifest.FeatureManifest)16 FileInputStream (java.io.FileInputStream)16 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)7 PredicateArgumentView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView)7 ArrayList (java.util.ArrayList)7 DiscreteFeature (edu.illinois.cs.cogcomp.edison.features.DiscreteFeature)6 LinkedHashSet (java.util.LinkedHashSet)6 Set (java.util.Set)6 POSBaseLineCounter (edu.illinois.cs.cogcomp.edison.utilities.POSBaseLineCounter)5 POSMikheevCounter (edu.illinois.cs.cogcomp.edison.utilities.POSMikheevCounter)5 IOException (java.io.IOException)5 EdisonException (edu.illinois.cs.cogcomp.edison.utilities.EdisonException)4 JsonObject (com.google.gson.JsonObject)3