Search in sources :

Example 21 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class WebAnnoCasUtil method isSameSentence.

/**
 * Determine whether two character offsets lie inside one and the same sentence.
 * <p>
 * Identical offsets are reported as being in the same sentence without consulting the
 * sentence annotations at all. Sentence boundaries are treated as inclusive on both ends.
 *
 * @param aJcas
 *            the JCas whose {@link Sentence} annotations are scanned.
 * @param aReferenceOffset
 *            the reference offset.
 * @param aCompareOffset
 *            the comparison offset.
 * @return {@code true} if both offsets are equal, or if the sentence containing the smaller
 *         offset also contains the larger one; {@code false} otherwise.
 */
public static boolean isSameSentence(JCas aJcas, int aReferenceOffset, int aCompareOffset) {
    // Equal offsets need no lookup
    if (aReferenceOffset == aCompareOffset) {
        return true;
    }
    // Normalize so that "lower" is the smaller and "upper" the larger of the two offsets
    final int lower = Math.min(aReferenceOffset, aCompareOffset);
    final int upper = Math.max(aReferenceOffset, aCompareOffset);
    for (Iterator<Sentence> it = JCasUtil.iterator(aJcas, Sentence.class); it.hasNext();) {
        Sentence candidate = it.next();
        int begin = candidate.getBegin();
        int end = candidate.getEnd();
        // Once a sentence starts beyond the lower offset, no later sentence can contain it.
        // NOTE(review): this early exit relies on the iterator yielding sentences in
        // ascending begin order - confirm against the index definition.
        if (lower < begin) {
            return false;
        }
        // Found the sentence holding the lower offset; the answer depends only on whether
        // the upper offset is inside the same span.
        if (lower <= end) {
            return begin <= upper && upper <= end;
        }
    }
    // No sentence contains the lower offset
    return false;
}
Also used : Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 22 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class WebAnnoCasUtil method getSentenceNumber.

/**
 * Get the 1-based number of the sentence covering the given offset.
 * <p>
 * Sentences are visited in the order returned by {@code select}; the first sentence whose
 * span (inclusive on both ends) contains the offset determines the result. If no sentence
 * contains the offset, the total number of sentences is returned.
 *
 * @param aJcas
 *            the JCas.
 * @param aBeginOffset
 *            the begin offset.
 * @return the sentence number, starting at 1.
 * @throws IndexOutOfBoundsException
 *             if the JCas contains no sentences at all.
 */
public static int getSentenceNumber(JCas aJcas, int aBeginOffset) {
    // Query the index once and reuse the result for both the emptiness check and the scan
    Collection<Sentence> sentences = select(aJcas, Sentence.class);
    if (sentences.isEmpty()) {
        throw new IndexOutOfBoundsException("No sentences");
    }
    int sentenceNumber = 0;
    for (Sentence sentence : sentences) {
        // Count the current sentence before testing it so the result is 1-based
        sentenceNumber++;
        if (sentence.getBegin() <= aBeginOffset && aBeginOffset <= sentence.getEnd()) {
            break;
        }
    }
    return sentenceNumber;
}
Also used : Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 23 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class RemoteApiController2 method createCompatibleCas.

/**
 * Import an uploaded annotation file into a CAS and verify that it is compatible with the
 * given source document.
 * <p>
 * The upload is considered compatible if its text equals the text of the initial CAS of the
 * source document (ignoring one trailing line break on either side) and if it has the same
 * number of sentence and token annotations with offsets accepted by
 * {@code assertCompatibleOffsets}.
 *
 * @param aProjectId
 *            ID of the project the document belongs to.
 * @param aDocumentId
 *            ID of the source document the upload is checked against.
 * @param aFile
 *            the uploaded annotation file.
 * @param aFormat
 *            the format of the upload; {@code FORMAT_DEFAULT} is used if absent.
 * @return the CAS imported from the upload, with its document text replaced by the text of
 *         the initial CAS.
 * @throws RemoteApiException
 *             declared by this method; an {@code UnsupportedFormatException} is thrown for an
 *             unknown format and an {@code IncompatibleDocumentException} for mismatching
 *             text, sentence count or token count (presumably both are subtypes - confirm).
 * @throws IOException
 *             if the temporary upload file cannot be created, written or deleted.
 */
private JCas createCompatibleCas(long aProjectId, long aDocumentId, MultipartFile aFile, Optional<String> aFormat) throws RemoteApiException, ClassNotFoundException, IOException, UIMAException {
    Project project = getProject(aProjectId);
    SourceDocument document = getDocument(project, aDocumentId);
    // Check if the format is supported
    String format = aFormat.orElse(FORMAT_DEFAULT);
    Map<String, Class<CollectionReader>> readableFormats = importExportService.getReadableFormats();
    if (readableFormats.get(format) == null) {
        throw new UnsupportedFormatException("Format [%s] not supported. Acceptable formats are %s.", format, readableFormats.keySet());
    }
    // Convert the uploaded annotation document into a CAS
    File tmpFile = null;
    JCas annotationCas;
    try {
        tmpFile = File.createTempFile("upload", ".bin");
        aFile.transferTo(tmpFile);
        annotationCas = importExportService.importCasFromFile(tmpFile, project, format);
    } finally {
        // Always clean up the temporary copy of the upload, even if the import failed
        if (tmpFile != null) {
            FileUtils.forceDelete(tmpFile);
        }
    }
    // Check if the uploaded file is compatible with the source document. They are compatible
    // if the text is the same and if all the token and sentence annotations have the same
    // offsets.
    JCas initialCas = documentService.createOrReadInitialCas(document);
    String initialText = initialCas.getDocumentText();
    String annotationText = annotationCas.getDocumentText();
    // If any of the texts contains trailing line breaks, we ignore that. We assume at the
    // moment that nobody will have created annotations over those trailing line breaks.
    initialText = StringUtils.chomp(initialText);
    annotationText = StringUtils.chomp(annotationText);
    if (ObjectUtils.notEqual(initialText, annotationText)) {
        // Report up to 20 characters of each text starting at the first differing position
        int diffIndex = StringUtils.indexOfDifference(initialText, annotationText);
        String expected = initialText.substring(diffIndex, Math.min(initialText.length(), diffIndex + 20));
        String actual = annotationText.substring(diffIndex, Math.min(annotationText.length(), diffIndex + 20));
        throw new IncompatibleDocumentException("Text of annotation document does not match text of source document at offset [%d]. Expected [%s] but found [%s].", diffIndex, expected, actual);
    }
    // Just in case we really had to chomp off a trailing line break from the annotation CAS,
    // make sure we copy over the proper text from the initial CAS.
    // NOT AT HOME THIS YOU SHOULD TRY
    // SETTING THE SOFA STRING FORCEFULLY FOLLOWING THE DARK SIDE IS!
    forceSetFeatureValue(annotationCas.getSofa(), CAS.FEATURE_BASE_NAME_SOFASTRING, initialCas.getDocumentText());
    FSUtil.setFeature(annotationCas.getDocumentAnnotationFs(), CAS.FEATURE_BASE_NAME_END, initialCas.getDocumentText().length());
    // Sentences must match in number and offsets
    Collection<Sentence> annotationSentences = select(annotationCas, Sentence.class);
    Collection<Sentence> initialSentences = select(initialCas, Sentence.class);
    if (annotationSentences.size() != initialSentences.size()) {
        throw new IncompatibleDocumentException("Expected [%d] sentences, but annotation document contains [%d] sentences.", initialSentences.size(), annotationSentences.size());
    }
    assertCompatibleOffsets(initialSentences, annotationSentences);
    // Tokens must match in number and offsets
    Collection<Token> annotationTokens = select(annotationCas, Token.class);
    Collection<Token> initialTokens = select(initialCas, Token.class);
    if (annotationTokens.size() != initialTokens.size()) {
        // Report the token counts here (not the sentence counts) so the message reflects
        // the check that actually failed
        throw new IncompatibleDocumentException("Expected [%d] tokens, but annotation document contains [%d] tokens.", initialTokens.size(), annotationTokens.size());
    }
    assertCompatibleOffsets(initialTokens, annotationTokens);
    return annotationCas;
}
Also used : SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) IncompatibleDocumentException(de.tudarmstadt.ukp.clarin.webanno.webapp.remoteapi.v2.exception.IncompatibleDocumentException) RProject(de.tudarmstadt.ukp.clarin.webanno.webapp.remoteapi.v2.model.RProject) Project(de.tudarmstadt.ukp.clarin.webanno.model.Project) UnsupportedFormatException(de.tudarmstadt.ukp.clarin.webanno.webapp.remoteapi.v2.exception.UnsupportedFormatException) File(java.io.File) MultipartFile(org.springframework.web.multipart.MultipartFile) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 24 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class WebAnnoTsv3WriterTestBase method makeJCasTwoSentences.

/**
 * Build a test JCas from a fixed two-line text and assert that the token builder produced
 * exactly two sentences for it.
 *
 * @return the populated JCas.
 */
private static JCas makeJCasTwoSentences() throws UIMAException {
    JCas twoSentenceCas = makeJCas();
    // The token builder is expected to start a new sentence at the line break
    TokenBuilder<Token, Sentence> builder = new TokenBuilder<>(Token.class, Sentence.class);
    builder.buildTokens(twoSentenceCas, "He loves her .\nShe loves him not .");
    int sentenceCount = select(twoSentenceCas, Sentence.class).size();
    assertEquals(2, sentenceCount);
    return twoSentenceCas;
}
Also used : TokenBuilder(org.apache.uima.fit.testing.factory.TokenBuilder) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 25 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class WebAnnoTsv3WriterTestBase method makeJCasOneSentence.

/**
 * Build a test JCas from the given text in which the whole document text is covered by
 * exactly one sentence annotation.
 *
 * @param aText
 *            the text to tokenize.
 * @return the populated JCas.
 */
private static JCas makeJCasOneSentence(String aText) throws UIMAException {
    JCas jcas = makeJCas();
    TokenBuilder<Token, Sentence> tb = new TokenBuilder<>(Token.class, Sentence.class);
    tb.buildTokens(jcas, aText);
    // Remove the sentences generated by the token builder, which presumably treats a line
    // break as a sentence break. Iterate over a snapshot: removing annotations from the
    // indexes while iterating the live select() result modifies the collection being
    // traversed.
    for (Sentence s : new java.util.ArrayList<Sentence>(select(jcas, Sentence.class))) {
        s.removeFromIndexes();
    }
    // Add a new sentence covering the whole text
    new Sentence(jcas, 0, jcas.getDocumentText().length()).addToIndexes();
    return jcas;
}
Also used : TokenBuilder(org.apache.uima.fit.testing.factory.TokenBuilder) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Aggregations

Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)90 JCas (org.apache.uima.jcas.JCas)41 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)34 ArrayList (java.util.ArrayList)22 AnnotatorState (de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.AnnotatorState)14 Type (org.apache.uima.cas.Type)12 AnnotationFS (org.apache.uima.cas.text.AnnotationFS)12 IOException (java.io.IOException)9 SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument)8 POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS)8 Test (org.junit.Test)8 HashMap (java.util.HashMap)7 TokenBuilder (org.apache.uima.fit.testing.factory.TokenBuilder)7 AnnotationException (de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException)6 WebAnnoCasUtil.getFirstSentence (de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getFirstSentence)6 AnnotationDocument (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument)6 AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)6 FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)6 CASException (org.apache.uima.cas.CASException)6 AutomationTypeAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter)5