Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
Class MultilingualTokenizeTextToColumn, method processFile.
/**
 * Given an input file containing plain text, tokenize it and write the result to the named
 * output file in column format.
 *
 * <p>Each token that falls inside a sentence (by character offsets) is written on its own line
 * as four tab-separated fields: 1-based index within the sentence, surface form, start character
 * offset, end character offset. An empty line is written after each sentence.
 *
 * @param corpus name of corpus
 * @param in file to tokenize; read as UTF-8
 * @param out output file for tokenized text; written as UTF-8
 * @throws IOException if {@code in} does not exist or is not a file, or if reading the input or
 *         writing the output fails
 */
public void processFile(String corpus, File in, String out) throws IOException {
    if (!in.exists())
        throw new IOException("File '" + in.getAbsolutePath() + "' doesn't exist.");
    if (!in.isFile())
        throw new IOException("File '" + in.getAbsolutePath() + "' exists but is not a file.");

    StringBuilder sb = new StringBuilder();
    // try-with-resources: the underlying file stream is released even if reading throws.
    try (Scanner scanner = new Scanner(new FileInputStream(in), StandardCharsets.UTF_8.name())) {
        while (scanner.hasNextLine()) {
            sb.append(scanner.nextLine()).append("\n");
        }
        // Scanner swallows read errors and simply reports "no more lines"; surface them so a
        // failed read is not mistaken for a short file.
        IOException readError = scanner.ioException();
        if (readError != null)
            throw readError;
    }
    String str = sb.toString();

    TextAnnotation ta = taBldr.createTextAnnotation(corpus, in.getName(), str);
    View sents = ta.getView(ViewNames.SENTENCE);
    logger.info("processing file '{}'; input length is {}", in.getAbsolutePath(), str.length());
    List<Constituent> toks = ta.getView(ViewNames.TOKENS).getConstituents();

    StringBuilder bldr = new StringBuilder();
    for (Constituent sent : sents) {
        // token numbering restarts at 1 for every sentence
        int index = 1;
        for (Constituent tok : toks) {
            boolean insideSentence = tok.getStartCharOffset() >= sent.getStartCharOffset()
                    && tok.getEndCharOffset() <= sent.getEndCharOffset();
            if (insideSentence) {
                bldr.append(index++).append("\t")
                        .append(tok.getSurfaceForm()).append("\t")
                        .append(tok.getStartCharOffset()).append("\t")
                        .append(tok.getEndCharOffset()).append(System.lineSeparator());
            }
        }
        // empty line to separate sentences
        bldr.append(System.lineSeparator());
    }

    // Materialize the output once; it is used for both the length report and the write.
    String output = bldr.toString();
    System.err.println("output length: " + output.length());

    try (OutputStreamWriter writer =
            new OutputStreamWriter(new FileOutputStream(new File(out)), StandardCharsets.UTF_8)) {
        writer.write(output);
    } catch (IOException e) {
        // Pass the exception to the logger so the stack trace goes to the log, then rethrow.
        logger.error("Can't write to file {}: {}", out, e.getMessage(), e);
        throw e;
    }
}
Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
Class PathLSTMHandler, method addView.
/**
 * Builds a view with {@code getSRL(ta)} and adds it to {@code ta} under the name returned by
 * {@code getViewName()}.
 *
 * @param ta the text annotation to which the view is added
 * @throws AnnotatorException if building or adding the view throws any exception; the original
 *         exception is attached as the cause
 */
@Override
public void addView(TextAnnotation ta) throws AnnotatorException {
    try {
        View srlView = getSRL(ta);
        ta.addView(getViewName(), srlView);
    } catch (Exception e) {
        e.printStackTrace();
        // Some exceptions (e.g. NullPointerException) carry no message; fall back to toString()
        // so the wrapper still says what went wrong.
        String message = e.getMessage() != null ? e.getMessage() : e.toString();
        AnnotatorException wrapped = new AnnotatorException(message);
        // Keep the original exception (and its stack trace) reachable through getCause().
        wrapped.initCause(e);
        throw wrapped;
    }
}
Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
Class LemmatizerTATest, method testCreateTextAnnotationLemmaView.
/**
 * Creates a lemma view for {@code inputTa} and checks the labels (lemmas) of the constituents
 * at positions 0, 1, 2, 6 and 15.
 */
@Test
public void testCreateTextAnnotationLemmaView() {
    View lemmaView = null;
    try {
        lemmaView = lem.createLemmaView(inputTa);
    } catch (IOException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    // Fail with an explicit message instead of silently skipping the assertions.
    if (null == lemmaView) {
        fail("createLemmaView returned null; no lemmas were checked");
    }

    List<Constituent> spans = lemmaView.getConstituents();
    printConstituents(System.out, spans);

    // assertEquals takes (expected, actual); keeping that order makes failure messages read
    // correctly ("expected:<the> but was:<...>").
    // NOTE(review): the source tokens are presumably 'The', 'CIA', 'thought', 'had', 'were' --
    // confirm against the text behind inputTa.
    assertEquals("the", spans.get(0).getLabel());
    assertEquals("cia", spans.get(1).getLabel());
    assertEquals("think", spans.get(2).getLabel());
    assertEquals("have", spans.get(6).getLabel());
    assertEquals("be", spans.get(15).getLabel());
}
Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
Class NERAnnotatorTest, method testTokenization.
/**
 * Test that tokenization produces the correct number of constituents: the view obtained for
 * {@code TOKEN_TEST} must contain 2 constituents, and the view for a second, inline sentence
 * must contain 3.
 */
@Test
public void testTokenization() {
    TextAnnotation ta = tab.createTextAnnotation(TOKEN_TEST);
    View nerView = null;
    try {
        nerView = getView(ta);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    // expected value first, matching the second assertion below
    assertEquals(2, nerView.getConstituents().size());

    String tokTestB = "Grigory Pasko, crusading Russian journalist who documented Russian Navy's mishandling of " + "nuclear waste, is released on parole after serving two-thirds of his four-year prison sentence.";
    ta = tab.createTextAnnotation(tokTestB);
    try {
        nerView = getView(ta);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    assertEquals(3, nerView.getNumberOfConstituents());
}
Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
Class NERAnnotatorTest, method evaluatePerformance.
/**
 * Timing check for the annotator. Annotates {@code TEST_INPUT} {@code SIZE} times, verifies that
 * every constituent's string form is contained in {@code entities}, and asserts that the average
 * time per run (in milliseconds) does not exceed the value returned by
 * {@code measureMachinePerformance()}.
 */
// @Test
public void evaluatePerformance() {
    final int SIZE = 100;

    // Run once before timing so that any lazy loading is excluded from the measurement.
    TextAnnotation warmUp = tab.createTextAnnotation(TEST_INPUT);
    try {
        getView(warmUp);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }

    long expectedPerformance = this.measureMachinePerformance();
    logger.info("Expect " + expectedPerformance);

    // One more untimed run, this time checking that a view is actually produced.
    {
        View untimedView = null;
        TextAnnotation untimedTa = tab.createTextAnnotation(TEST_INPUT);
        try {
            untimedView = getView(untimedTa);
        } catch (AnnotatorException e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
        assertTrue(untimedView != null);
    }

    // Timed section.
    long begin = System.currentTimeMillis();
    int run = 0;
    while (run < SIZE) {
        View timedView = null;
        TextAnnotation timedTa = tab.createTextAnnotation(TEST_INPUT);
        try {
            timedView = getView(timedTa);
        } catch (AnnotatorException e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
        assertTrue(timedView != null);
        for (Constituent found : timedView.getConstituents()) {
            String label = found.toString();
            assertTrue("No entity named \"" + label + "\"", entities.contains(label));
        }
        run++;
    }

    // Average wall-clock milliseconds per run (integer division, as a long).
    long averageMillis = (System.currentTimeMillis() - begin) / SIZE;
    double normalized = (double) averageMillis / (double) expectedPerformance;
    System.out.printf("For text size = %d, average NER runtime = %d, normalized = %f", TEST_INPUT.length(), averageMillis, normalized);
    assertTrue(averageMillis <= expectedPerformance);
}
Aggregations