Search in sources :

Example 26 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class WebAnnoTsv3WriterTestBase method testTwoSentencesWithNoSpaceInBetween.

/**
 * Builds a document whose text is {@code "onetwo"} and which consists of two sentences
 * ({@code [0,3)} and {@code [3,6)}) that touch each other without any separating
 * whitespace, each holding exactly one token, and then hands the CAS to
 * {@code writeAndAssertEquals}.
 */
@Test
public void testTwoSentencesWithNoSpaceInBetween() throws Exception {
    // Merge the default type system description with the one loaded from the test resources.
    TypeSystemDescription defaultTypes = TypeSystemDescriptionFactory.createTypeSystemDescription();
    TypeSystemDescription testTypes = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath("src/test/resources/desc/type/webannoTestTypes.xml");
    JCas jcas = JCasFactory.createJCas(CasCreationUtils.mergeTypeSystems(asList(defaultTypes, testTypes)));

    DocumentMetaData.create(jcas).setDocumentId("doc");
    jcas.setDocumentText("onetwo");

    // One token and one sentence per span; the spans are adjacent (end of the first ==
    // begin of the second). Creation order is token-then-sentence for each span.
    int[][] spans = { { 0, 3 }, { 3, 6 } };
    for (int[] span : spans) {
        new Token(jcas, span[0], span[1]).addToIndexes();
        new Sentence(jcas, span[0], span[1]).addToIndexes();
    }

    writeAndAssertEquals(jcas);
}
Also used : TypeSystemDescription(org.apache.uima.resource.metadata.TypeSystemDescription) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) Test(org.junit.Test)

Example 27 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class Tsv3XDeserializer method readContent.

/**
 * Reads the body of a TSV document line by line and populates the given document.
 * <p>
 * Every line is classified as one of: sentence header (starts with {@code PREFIX_TEXT}),
 * empty line (sentence break), token line, sub-token line (ID contains a {@code "."}), or
 * end of input ({@code null} line). The classification of the current line is validated
 * against the classification of the previous line before it is processed. While reading,
 * the document text is reconstructed from the sentence headers, UIMA {@code Sentence} and
 * {@code Token} annotations are created, and the annotation columns are delegated to
 * {@code parseAnnotations}. At the end, the document text is set on the JCas and all
 * annotations collected on tokens and sub-tokens are added to the CAS indexes.
 *
 * @param aIn
 *            the reader to consume lines from; its line number is used in error messages.
 * @param aDoc
 *            the document that receives sentences, tokens and annotations.
 * @throws IOException
 *             if reading fails or if the lines do not appear in an allowed order.
 */
private void readContent(LineNumberReader aIn, TsvDocument aDoc) throws IOException {
    // Document text reconstructed so far.
    StringBuilder text = new StringBuilder();
    // Start as if a sentence break had just been seen, so the first line must be a header.
    State prevState = State.INTER_SENTENCE_SPACE;
    State state = State.INTER_SENTENCE_SPACE;
    // Text of the sentence header line(s) read since the last token; flushed into "text"
    // when the first token of the sentence is encountered.
    StringBuilder sentenceText = new StringBuilder();
    // NOTE(review): prevSentence is assigned below but never read within this method.
    TsvSentence prevSentence = null;
    TsvSentence sentence = null;
    TsvToken token = null;
    String line = aIn.readLine();
    while (!State.END.equals(state)) {
        // These variables are only used in TOKEN and SUBTOKEN states.
        String[] fields = null;
        String id = null;
        String[] offsets = null;
        int begin = -1;
        int end = -1;
        // Determine the status of the current line
        if (startsWith(line, PREFIX_TEXT)) {
            state = State.SENTENCE;
        } else if (line == null) {
            state = State.END;
        } else if (isEmpty(line)) {
            state = State.INTER_SENTENCE_SPACE;
        } else {
            fields = splitPreserveAllTokens(line, FIELD_SEPARATOR);
            // Get token metadata
            // First field is the token/sub-token ID, second field is "begin-end".
            // NOTE(review): a line with fewer than two fields or non-numeric offsets looks
            // like it would fail here with an unchecked exception rather than an
            // IOException - confirm whether that is intended.
            id = fields[0];
            offsets = split(fields[1], "-");
            begin = Integer.valueOf(offsets[0]);
            end = Integer.valueOf(offsets[1]);
            // TOKEN or SUBTOKEN?
            if (id.contains(".")) {
                state = State.SUBTOKEN;
            } else {
                state = State.TOKEN;
            }
        }
        // Assert that the order of information in the file is correct
        switch(prevState) {
            case INTER_SENTENCE_SPACE:
                // After a sentence break (or at the very start) only a header may follow.
                if (!State.SENTENCE.equals(state)) {
                    throw new IOException("Line " + aIn.getLineNumber() + ": Expected sentence header but got [" + state + "]");
                }
                break;
            case SENTENCE:
                // A header may be followed by another header line or by the first token.
                if (!(State.SENTENCE.equals(state) || State.TOKEN.equals(state))) {
                    throw new IOException("Line " + aIn.getLineNumber() + ": Expected sentence header or token but got [" + state + "]");
                }
                break;
            case TOKEN:
            case SUBTOKEN:
                // After a (sub-)token: more (sub-)tokens, a sentence break, or end of input.
                if (!(State.INTER_SENTENCE_SPACE.equals(state) || State.END.equals(state) || State.TOKEN.equals(state) || State.SUBTOKEN.equals(state))) {
                    throw new IOException("Line " + aIn.getLineNumber() + ": Expected token, sub-token or sentence break but got [" + state + "]");
                }
                break;
        }
        // Do the actual parsing
        switch(state) {
            case END:
            case INTER_SENTENCE_SPACE:
                // End of sentence action
                // The validation above only lets us get here after a TOKEN or SUBTOKEN
                // line, so a current sentence exists at this point.
                // The -1 here is to account for the trailing line break
                sentence.getUimaSentence().setEnd(text.length() - 1);
                sentence.getUimaSentence().addToIndexes();
                prevSentence = sentence;
                sentence = null;
                break;
            case TOKEN:
                // End of sentence header action
                // Runs only for the first token following the sentence header line(s).
                if (State.SENTENCE.equals(prevState)) {
                    // If the text collected so far extends past the begin offset of this
                    // token, the surplus is expected (see asserts) to be exactly the one
                    // line break appended after the previous sentence header; drop it.
                    if (text.length() > begin) {
                        assert text.length() == begin + 1;
                        assert text.charAt(text.length() - 1) == LINE_BREAK;
                        text.setLength(text.length() - 1);
                    }
                    // If the text collected so far is shorter than the begin offset of
                    // this token, pad it with spaces to fill the gap.
                    if (text.length() < begin) {
                        text.append(repeat(' ', begin - text.length()));
                    }
                    assert text.length() == begin;
                    assert sentence == null;
                    // The sentence begins where the text currently ends; its end offset
                    // is set later in the end-of-sentence action.
                    Sentence uimaSentence = new Sentence(aDoc.getJCas());
                    uimaSentence.setBegin(text.length());
                    sentence = aDoc.createSentence(uimaSentence);
                    // Flush the buffered header text into the document text.
                    text.append(sentenceText);
                    sentenceText.setLength(0);
                }
                // Token parsing action
                Token uimaToken = new Token(aDoc.getJCas(), begin, end);
                uimaToken.addToIndexes();
                token = sentence.createToken(uimaToken);
                // Read annotations from the columns
                parseAnnotations(aDoc, sentence, token, fields);
                break;
            case SUBTOKEN:
                // Read annotations from the columns
                // The sub-token is attached to the most recently read token.
                TsvSubToken subToken = token.createSubToken(begin, end);
                parseAnnotations(aDoc, sentence, subToken, fields);
                break;
            case SENTENCE:
                // Header parsing action
                // Buffer the unescaped text following the "=" plus a line break; several
                // consecutive header lines accumulate in the buffer.
                String textFragment = substringAfter(line, "=");
                textFragment = unescapeText(aDoc.getFormatHeader(), textFragment);
                sentenceText.append(textFragment);
                sentenceText.append(LINE_BREAK);
                break;
        }
        prevState = state;
        line = aIn.readLine();
    }
    aDoc.getJCas().setDocumentText(text.toString());
    // After all data has been read, we also add the annotations with disambiguation ID to
    // the CAS indexes. This ensures we only add them after their final begin/end offsets
    // have been determined since most of these annotations are actually multi-token
    // annotations.
    CAS cas = aDoc.getJCas().getCas();
    // A LinkedHashSet keeps encounter order and ensures that a feature structure reachable
    // from several tokens/sub-tokens is added only once.
    Set<FeatureStructure> fses = new LinkedHashSet<>();
    for (TsvSentence s : aDoc.getSentences()) {
        for (TsvToken t : s.getTokens()) {
            for (Type type : t.getUimaTypes()) {
                fses.addAll(t.getUimaAnnotations(type));
            }
            for (TsvSubToken st : t.getSubTokens()) {
                for (Type type : st.getUimaTypes()) {
                    fses.addAll(st.getUimaAnnotations(type));
                }
            }
        }
    }
    fses.forEach(cas::addFsToIndexes);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) TsvToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvToken) TsvSubToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) TsvSentence(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSentence) IOException(java.io.IOException) TsvSubToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken) FeatureStructure(org.apache.uima.cas.FeatureStructure) Type(org.apache.uima.cas.Type) LayerType(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.LayerType) CAS(org.apache.uima.cas.CAS) TsvToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvToken) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) TsvSentence(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSentence)

Example 28 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class AutomationPage method actionCompletePreferencesChange.

/**
 * Called once a change of the annotation preferences has been completed. Re-resolves the
 * first visible sentence, rebuilds the curation container, refreshes the page and triggers
 * a browser-side reload. Any exception is passed to {@code handleException}.
 *
 * @param aTarget
 *            the AJAX request target used to schedule the updates.
 */
private void actionCompletePreferencesChange(AjaxRequestTarget aTarget) {
    try {
        AnnotatorState annotatorState = getModelObject();
        JCas cas = getEditorCas();

        // The preferences may have altered how many sentences are visible. Setting the
        // first visible unit again lets the state recompute the visible range.
        Sentence firstVisible = selectByAddr(cas, Sentence.class, annotatorState.getFirstVisibleUnitAddress());
        annotatorState.setFirstVisibleUnit(firstVisible);

        // Rebuild the curation container from scratch for the updated state.
        curationContainer = new SuggestionBuilder(casStorageService, documentService, correctionDocumentService, curationDocumentService, annotationService, userRepository).buildCurationContainer(annotatorState);
        setCurationSegmentBeginEnd(cas);
        curationContainer.setBratAnnotatorModel(annotatorState);

        update(aTarget);
        aTarget.appendJavaScript("Wicket.Window.unloadConfirmation = false;window.location.reload()");

        // The sidebar width may be different now, so the entire page is re-rendered.
        aTarget.add(this);
    } catch (Exception e) {
        handleException(aTarget, e);
    }
}
Also used : AnnotatorState(de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.AnnotatorState) JCas(org.apache.uima.jcas.JCas) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) SuggestionBuilder(de.tudarmstadt.ukp.clarin.webanno.ui.curation.component.model.SuggestionBuilder) AnnotationException(de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException) UIMAException(org.apache.uima.UIMAException) IOException(java.io.IOException)

Example 29 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class AutomationUtil method generatePredictDocument.

// TODO: rename to predictDocument
/**
 * Writes one prediction input file per source document of the project that owns the
 * template's train feature. The file is named {@code <document id>.pred.ft}, is placed in
 * the MIRA directory of the train feature (created if missing) and contains one line per
 * sentence as produced by {@code getMiraLine(sentence, null, adapter)}.
 * <p>
 * For each document the correction CAS is tried first; if reading it fails for any
 * reason, the annotation CAS of the current user is used instead.
 *
 * @param aTemplate
 *            the template providing the train feature.
 * @param aRepository
 *            used to list source documents and to read the fallback annotation CAS.
 * @param aCorrectionDocumentService
 *            used to read the correction CAS.
 * @param aAnnotationService
 *            used to obtain the type adapter for the feature's layer.
 * @param aAutomationService
 *            used to locate the MIRA directory.
 * @param aUserDao
 *            used to obtain the current user.
 * @throws IOException
 *             if the directory or a prediction file cannot be written, or a CAS cannot be
 *             read.
 * @throws UIMAException
 *             declared for the CAS access performed here.
 * @throws ClassNotFoundException
 *             declared for the CAS access performed here.
 */
public static void generatePredictDocument(MiraTemplate aTemplate, DocumentService aRepository, CorrectionDocumentService aCorrectionDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws IOException, UIMAException, ClassNotFoundException {
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    User user = aUserDao.getCurrentUser();
    AnnotationFeature feature = aTemplate.getTrainFeature();
    AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(feature.getLayer());
    for (SourceDocument document : aRepository.listSourceDocuments(feature.getProject())) {
        File predFile = new File(miraDir, document.getId() + ".pred.ft");
        // try-with-resources guarantees the writer is closed even when reading the CAS or
        // generating a line throws; previously the file handle leaked in that case.
        try (BufferedWriter predOut = new BufferedWriter(new FileWriter(predFile))) {
            JCas jCas;
            try {
                jCas = aCorrectionDocumentService.readCorrectionCas(document);
            } catch (Exception e) {
                // Deliberate best-effort fallback: no usable correction CAS, so use the
                // current user's annotation CAS (creating the annotation document if needed).
                AnnotationDocument annoDoc = aRepository.createOrGetAnnotationDocument(document, user);
                jCas = aRepository.readAnnotationCas(annoDoc);
            }
            for (Sentence sentence : select(jCas, Sentence.class)) {
                predOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
            }
        }
    }
}
Also used : User(de.tudarmstadt.ukp.clarin.webanno.security.model.User) FileWriter(java.io.FileWriter) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) JCas(org.apache.uima.jcas.JCas) AnnotationDocument(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument) File(java.io.File) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NoResultException(javax.persistence.NoResultException) AnnotationException(de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException) CASException(org.apache.uima.cas.CASException) UIMAException(org.apache.uima.UIMAException) DataRetrievalFailureException(org.springframework.dao.DataRetrievalFailureException) IOException(java.io.IOException) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) AutomationTypeAdapter(de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter) BufferedWriter(java.io.BufferedWriter)

Example 30 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class AutomationUtil method generateTrainDocument.

/**
 * Generates the training file for the template's train feature in the feature's MIRA
 * directory (created if missing).
 * <p>
 * The method returns without writing anything unless at least one of the following holds:
 * an unprocessed training document exists for one of the template's other features or for
 * the train feature, the project has at least one source document, or an unprocessed
 * tab-sep document exists for the train feature.
 * <p>
 * The file is named {@code <layer id>-<feature id>.train.ft} when {@code aBase} is
 * {@code true} and {@code <layer id>-<feature id>.train.base} otherwise. It is filled from
 * three sources, in this order: non-tab-sep training documents of the feature, source
 * documents whose curation is finished, and tab-sep documents of the feature.
 *
 * @param aTemplate
 *            the template providing the train feature and the other features.
 * @param aRepository
 *            used to list the project's source documents.
 * @param aCurationDocumentService
 *            used to read the curation CAS of finished source documents.
 * @param aAnnotationService
 *            used to obtain the type adapter for the feature's layer.
 * @param aAutomationService
 *            used to access the MIRA directory, training documents and automation status.
 * @param aUserDao
 *            not used by this method.
 * @param aBase
 *            if {@code true}, lines are generated without the train feature (and tab-sep
 *            lines with an empty second value); otherwise the train feature is included,
 *            training documents are marked as processed and the status' training document
 *            counter is decremented per document.
 * @throws IOException
 *             if a file cannot be read or written.
 * @throws UIMAException
 *             declared for the CAS access performed here.
 * @throws ClassNotFoundException
 *             declared for the CAS access performed here.
 * @throws AutomationException
 *             if a non-blank line of a tab-sep document does not consist of exactly two
 *             tab-separated tokens.
 */
public static void generateTrainDocument(MiraTemplate aTemplate, DocumentService aRepository, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao, boolean aBase) throws IOException, UIMAException, ClassNotFoundException, AutomationException {
    LOG.info("Starting to generate training document");
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    AnnotationFeature feature = aTemplate.getTrainFeature();
    boolean documentChanged = false;
    // A. training document for other train layers were changed
    for (AnnotationFeature otherFeature : aTemplate.getOtherFeatures()) {
        for (TrainingDocument document : aAutomationService.listTrainingDocuments(aTemplate.getTrainFeature().getProject())) {
            if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(otherFeature)) {
                documentChanged = true;
                break;
            }
        }
    }
    // B. Training document for the main training layer were changed
    for (TrainingDocument document : aAutomationService.listTrainingDocuments(feature.getProject())) {
        if (!document.isProcessed() && (document.getFeature() != null && document.getFeature().equals(feature))) {
            documentChanged = true;
            break;
        }
    }
    // C. New Curation document arrives
    if (aRepository.listSourceDocuments(feature.getProject()).size() > 0) {
        documentChanged = true;
    }
    // D. tab-sep training documents
    for (TrainingDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(feature)) {
            documentChanged = true;
            break;
        }
    }
    if (!documentChanged) {
        return;
    }
    File trainFile;
    if (aBase) {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.ft");
    } else {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.base");
    }
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    // try-with-resources guarantees the training file is closed on every exit path,
    // including exceptions thrown while reading a CAS or a tab-sep document.
    try (BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile))) {
        AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(feature.getLayer());
        // Training documents (Curated or webanno-compatible imported ones - read using UIMA)
        List<TrainingDocument> trainingDocuments = aAutomationService.listTrainingDocuments(feature.getProject());
        for (TrainingDocument trainingDocument : trainingDocuments) {
            if ((trainingDocument.getFeature() != null && trainingDocument.getFeature().equals(feature)) && !trainingDocument.getFormat().equals(WebAnnoConst.TAB_SEP)) {
                JCas jCas = aAutomationService.readTrainingAnnotationCas(trainingDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                }
                trainingDocument.setProcessed(!aBase);
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
        }
        // for curated documents
        List<SourceDocument> sourceDocuments = aRepository.listSourceDocuments(feature.getProject());
        int sourceDocsCount = 0;
        for (SourceDocument sourceDocument : sourceDocuments) {
            if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
                JCas jCas = aCurationDocumentService.readCurationCas(sourceDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                }
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
            sourceDocsCount++;
            // Report progress against the list actually being iterated (the source
            // documents), not against the number of training documents.
            LOG.info("Processed source document " + sourceDocsCount + " of " + sourceDocuments.size());
        }
        // Tab-sep documents to be used as a target layer train document
        int goldStandardDocsCounter = 0;
        List<TrainingDocument> goldStandardDocs = aAutomationService.listTabSepDocuments(feature.getProject());
        for (TrainingDocument document : goldStandardDocs) {
            if (document.getFormat().equals(WebAnnoConst.TAB_SEP) && document.getFeature() != null && document.getFeature().equals(feature)) {
                File tabSepFile = new File(aAutomationService.getDocumentFolder(document), document.getName());
                // Close the underlying reader once the document has been consumed or an
                // error occurs; it was previously never closed.
                try (FileReader tabSepReader = new FileReader(tabSepFile)) {
                    LineIterator it = IOUtils.lineIterator(tabSepReader);
                    while (it.hasNext()) {
                        String line = it.next();
                        if (line.trim().equals("")) {
                            // Blank lines are carried over as empty lines.
                            trainOut.append("\n");
                        } else {
                            StringTokenizer st = new StringTokenizer(line, "\t");
                            if (st.countTokens() != 2) {
                                // Both the reader and the training file are closed by the
                                // enclosing try-with-resources statements.
                                throw new AutomationException("This is not a valid TAB-SEP document");
                            }
                            if (aBase) {
                                trainOut.append(getMiraLineForTabSep(st.nextToken(), ""));
                            } else {
                                trainOut.append(getMiraLineForTabSep(st.nextToken(), st.nextToken()));
                            }
                        }
                    }
                }
            }
            goldStandardDocsCounter++;
            LOG.info("Processed gold standard document " + goldStandardDocsCounter + " of " + goldStandardDocs.size());
        }
    }
    LOG.info("Completed generating training document");
}
Also used : FileWriter(java.io.FileWriter) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) JCas(org.apache.uima.jcas.JCas) LineIterator(org.apache.commons.io.LineIterator) AutomationStatus(de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus) BufferedWriter(java.io.BufferedWriter) AutomationTypeAdapter(de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter) StringTokenizer(java.util.StringTokenizer) FileReader(java.io.FileReader) File(java.io.File) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) TrainingDocument(de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument)

Aggregations

Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)90 JCas (org.apache.uima.jcas.JCas)41 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)34 ArrayList (java.util.ArrayList)22 AnnotatorState (de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.AnnotatorState)14 Type (org.apache.uima.cas.Type)12 AnnotationFS (org.apache.uima.cas.text.AnnotationFS)12 IOException (java.io.IOException)9 SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument)8 POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS)8 Test (org.junit.Test)8 HashMap (java.util.HashMap)7 TokenBuilder (org.apache.uima.fit.testing.factory.TokenBuilder)7 AnnotationException (de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException)6 WebAnnoCasUtil.getFirstSentence (de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getFirstSentence)6 AnnotationDocument (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument)6 AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)6 FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)6 CASException (org.apache.uima.cas.CASException)6 AutomationTypeAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter)5