Search in sources :

Example 16 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project webanno by webanno.

the class WebannoTsv3Reader method convertToCas.

/**
 * Populates the given CAS from a WebAnno TSV3 input stream.
 *
 * <p>The document title found in the CAS metadata is stored in the {@code fileName} field,
 * then {@code setAnnotations} is invoked on the stream, and finally the content of the
 * {@code coveredText} field is installed as the document text of the CAS.
 *
 * @param aJCas     the CAS to fill; its {@link DocumentMetaData} is read for the title
 * @param aIs       the stream handed on to {@code setAnnotations}
 * @param aEncoding the encoding handed on to {@code setAnnotations}
 * @throws IOException declared by this method; presumably raised while reading the stream
 */
public void convertToCas(JCas aJCas, InputStream aIs, String aEncoding) throws IOException {
    // Keep the document title around in the reader's fileName field.
    fileName = DocumentMetaData.get(aJCas).getDocumentTitle();

    // A separate layer/feature setup step (setLayerAndFeature) is intentionally not run here.
    setAnnotations(aJCas, aIs, aEncoding);

    // NOTE(review): coveredText is presumably filled by setAnnotations - confirm.
    final String documentText = coveredText.toString();
    aJCas.setDocumentText(documentText);
}
Also used : DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData)

Example 17 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project webanno by webanno.

the class TcfWriter method process.

/**
 * Writes the given CAS as TCF, optionally merging it into the original source document.
 *
 * <p>When {@code merge} is set, the document URI from the CAS metadata is opened and
 * {@code casToTcfWriter(InputStream, JCas, OutputStream)} is attempted. If the source cannot
 * be opened, or the merging writer throws anything at all, the problem is only logged at
 * debug level and the CAS is written through {@code casToTcfWriter(JCas, OutputStream)}
 * instead. The same plain write is used when merging is disabled.
 *
 * @param aJCas the CAS to serialize
 * @throws AnalysisEngineProcessException wrapping any exception that is not handled by the
 *         merge fallback described above
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    // Declared out here so that the outermost finally can always close it.
    InputStream sourceStream = null;
    try {
        boolean merged = false;
        if (!merge) {
            getLogger().debug("Merging disabled");
        } else {
            OutputStream mergeTarget = null;
            try {
                mergeTarget = getOutputStream(aJCas, filenameSuffix);
                // Locate the original TCF file so that its content can be preserved
                DocumentMetaData metadata = DocumentMetaData.get(aJCas);
                String sourceUri = metadata.getDocumentUri();
                URL sourceUrl = new URL(sourceUri);
                try {
                    sourceStream = sourceUrl.openStream();
                    try {
                        getLogger().debug("Merging with [" + sourceUri + "]");
                        casToTcfWriter(sourceStream, aJCas, mergeTarget);
                        merged = true;
                    } catch (Exception ex) {
                        // Workaround: every exception from the merging writer is taken to
                        // mean that the source is not TCF
                        getLogger().debug("Source file is not TCF: " + ex.getMessage());
                    }
                } catch (IOException e) {
                    getLogger().debug("Cannot open source file to merge with: " + e.getMessage());
                }
            } finally {
                closeQuietly(mergeTarget);
            }
        }

        // Merging was disabled or did not succeed: write the CAS on its own
        if (!merged) {
            OutputStream plainTarget = null;
            try {
                plainTarget = getOutputStream(aJCas, filenameSuffix);
                casToTcfWriter(aJCas, plainTarget);
            } finally {
                closeQuietly(plainTarget);
            }
        }
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    } finally {
        closeQuietly(sourceStream);
    }
}
Also used : InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) IOException(java.io.IOException) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) URL(java.net.URL) WLFormatException(eu.clarin.weblicht.wlfxb.io.WLFormatException)

Example 18 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project webanno by webanno.

the class TeiReaderTest method testTeiReader.

/**
 * Runs {@code TeiReader} over the XML files matched below {@code classpath:/local/} and,
 * for every CAS produced, prints the document id, text length and language before checking
 * the expected annotation counts and the text of the first sentence.
 *
 * <p>Currently disabled through {@code @Ignore}.
 */
@Test
@Ignore("No TEI yet to opensource ")
public void testTeiReader() throws Exception {
    final String expectedFirstSentence = "70 I DAG.";

    CollectionReaderDescription teiReader = createReaderDescription(
            TeiReader.class,
            TeiReader.PARAM_LANGUAGE, "en",
            TeiReader.PARAM_SOURCE_LOCATION, "classpath:/local/",
            TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" });

    for (JCas document : new JCasIterable(teiReader)) {
        DocumentMetaData metadata = DocumentMetaData.get(document);
        String documentText = document.getDocumentText();

        // Diagnostic output: document id, text length and language
        System.out.printf("%s - %d%n", metadata.getDocumentId(), documentText.length());
        System.out.println(document.getDocumentLanguage());

        // Expected number of annotations per type
        assertEquals(2235, JCasUtil.select(document, Token.class).size());
        assertEquals(745, JCasUtil.select(document, POS.class).size());
        assertEquals(745, JCasUtil.select(document, Lemma.class).size());
        assertEquals(0, JCasUtil.select(document, NamedEntity.class).size());
        assertEquals(30, JCasUtil.select(document, Sentence.class).size());

        // The first sentence returned by the selection must carry the expected text
        Sentence leadingSentence = JCasUtil.select(document, Sentence.class).iterator().next();
        assertEquals(expectedFirstSentence, leadingSentence.getCoveredText());
    }
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) JCasIterable(org.apache.uima.fit.pipeline.JCasIterable) JCas(org.apache.uima.jcas.JCas) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 19 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project webanno by webanno.

the class WebannoTsv1Reader method convertToCas.

/**
 * Populates the given CAS from a WebAnno TSV (version 1) input stream.
 *
 * <p>The document title from the CAS metadata is stored in the {@code fileName} field.
 * {@code setAnnotations} is then called with a set of empty collectors (text buffer,
 * per-column maps and the list of sentence-initial tokens). The collected text becomes the
 * document text, after which tokens, named entities, dependencies and sentences are created
 * through the corresponding {@code create*} helpers, in that order.
 *
 * @param aJCas     the CAS to fill
 * @param aIs       the stream handed on to {@code setAnnotations}
 * @param aEncoding the encoding handed on to {@code setAnnotations}
 * @throws IOException declared by this method; presumably raised while reading the stream
 */
public void convertToCas(JCas aJCas, InputStream aIs, String aEncoding) throws IOException {
    // Keep the document title around in the reader's fileName field.
    fileName = DocumentMetaData.get(aJCas).getDocumentTitle();

    // Collectors handed to setAnnotations.
    // NOTE(review): presumably keyed by token number - confirm against setAnnotations.
    StringBuilder documentText = new StringBuilder();
    Map<Integer, String> tokenValues = new HashMap<>();
    Map<Integer, String> posValues = new HashMap<>();
    Map<Integer, String> lemmaValues = new HashMap<>();
    Map<Integer, String> namedEntityValues = new HashMap<>();
    Map<Integer, String> dependencyFunctions = new HashMap<>();
    Map<Integer, Integer> dependencyDependents = new HashMap<>();
    List<Integer> sentenceStarts = new ArrayList<>();

    setAnnotations(aIs, aEncoding, documentText, tokenValues, posValues, lemmaValues,
            namedEntityValues, dependencyFunctions, dependencyDependents, sentenceStarts);

    aJCas.setDocumentText(documentText.toString());

    // Shared between the create* helpers below.
    Map<String, Token> storedTokens = new HashMap<>();
    createToken(aJCas, documentText, tokenValues, posValues, lemmaValues, storedTokens);
    createNamedEntity(namedEntityValues, aJCas, tokenValues, storedTokens);
    createDependency(aJCas, tokenValues, dependencyFunctions, dependencyDependents,
            storedTokens);
    createSentence(aJCas, sentenceStarts, storedTokens);
}
Also used : HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData)

Example 20 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project dkpro-tc by dkpro.

the class PairReader_ImplBase method createMetaData.

/**
 * Creates a {@link DocumentMetaData} annotation in the given CAS and fills it in.
 *
 * <p>The base URI is set to the empty string and the document URI to {@code "/"} followed
 * by the document id.
 *
 * @param jcas         the CAS in which the metadata is created
 * @param collectionId value for the collection id
 * @param docId        value for the document id; also used to build the document URI
 * @param docTitle     value for the document title
 */
protected void createMetaData(JCas jcas, String collectionId, String docId, String docTitle) {
    final String documentUri = "/" + docId;

    DocumentMetaData meta = DocumentMetaData.create(jcas);
    meta.setCollectionId(collectionId);
    meta.setDocumentBaseUri("");
    meta.setDocumentUri(documentUri);
    meta.setDocumentTitle(docTitle);
    meta.setDocumentId(docId);
}
Also used : DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData)

Aggregations

DocumentMetaData (de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData): 27, JCas (org.apache.uima.jcas.JCas): 7, ArrayList (java.util.ArrayList): 6, IOException (java.io.IOException): 5, AnalysisEngine (org.apache.uima.analysis_engine.AnalysisEngine): 4, TextClassificationOutcome (org.dkpro.tc.api.type.TextClassificationOutcome): 4, Project (de.tudarmstadt.ukp.clarin.webanno.model.Project): 3, Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token): 3, HashMap (java.util.HashMap): 3, HashSet (java.util.HashSet): 3, AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException): 3, CAS (org.apache.uima.cas.CAS): 3, JCasId (org.dkpro.tc.api.type.JCasId): 3, TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget): 3, AnnotationDocument (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument): 2, SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument): 2, User (de.tudarmstadt.ukp.clarin.webanno.security.model.User): 2, File (java.io.File): 2, FileNotFoundException (java.io.FileNotFoundException): 2, LinkedHashMap (java.util.LinkedHashMap): 2