Search in sources :

Example 11 with ContentInfoSet

use of de.catma.document.source.ContentInfoSet in project catma by forTEXT.

the class GitSourceDocumentHandlerTest method create.

/**
 * Creates a source document through {@code GitSourceDocumentHandler.create} inside a freshly created
 * project and checks that the original file, the converted file and the serialized {@code header.json}
 * end up in the expected repository directory.
 *
 * @throws Exception if any of the file, Git or project operations fail
 */
@Test
public void create() throws Exception {
    File originalSourceDocument = new File("testdocs/rose_for_emily.pdf");
    File convertedSourceDocument = new File("testdocs/rose_for_emily.txt");
    IndexInfoSet indexInfoSet = new IndexInfoSet();
    indexInfoSet.setLocale(Locale.ENGLISH);
    ContentInfoSet contentInfoSet = new ContentInfoSet("William Faulkner", "", "", "A Rose for Emily");
    TechInfoSet techInfoSet = new TechInfoSet(FileType.TEXT, StandardCharsets.UTF_8, FileOSType.DOS, 705211438L);
    SourceDocumentInfo sourceDocumentInfo = new SourceDocumentInfo(indexInfoSet, contentInfoSet, techInfoSet);
    // Term extraction consumes its stream completely, so it gets a dedicated, short-lived stream that is
    // closed right away. The handler below receives a fresh stream, otherwise an empty file would be
    // written (FileInputStream does not support `reset`).
    String convertedSourceDocumentText;
    try (FileInputStream termExtractionStream = new FileInputStream(convertedSourceDocument)) {
        convertedSourceDocumentText = IOUtils.toString(termExtractionStream, techInfoSet.getCharset());
    }
    Map<String, List<TermInfo>> terms = new TermExtractor(convertedSourceDocumentText, new ArrayList<>(), new ArrayList<>(), indexInfoSet.getLocale()).getTerms();
    String sourceDocumentUuid = new IDGenerator().generateDocumentId();
    // GraphWorktreeProject.TOKENIZED_FILE_EXTENSION
    String tokenizedSourceDocumentFileName = sourceDocumentUuid + "." + "json";
    // The streams are declared as resources so that they are closed even when an assertion or a Git
    // operation fails half-way through the test.
    try (FileInputStream originalSourceDocumentStream = new FileInputStream(originalSourceDocument);
            FileInputStream convertedSourceDocumentStream = new FileInputStream(convertedSourceDocument);
            ILocalGitRepositoryManager jGitRepoManager = new JGitRepoManager(CATMAPropertyKey.GitBasedRepositoryBasePath.getValue(), gitlabManagerRestricted.getUser())) {
        directoriesToDeleteOnTearDown.add(jGitRepoManager.getRepositoryBasePath());
        BackgroundService mockBackgroundService = mock(BackgroundService.class);
        EventBus mockEventBus = mock(EventBus.class);
        GitProjectManager gitProjectManager = new GitProjectManager(CATMAPropertyKey.GitBasedRepositoryBasePath.getValue(), gitlabManagerRestricted, // noop deletion handler
        (projectId) -> {
        }, mockBackgroundService, mockEventBus);
        String projectId = gitProjectManager.create("Test CATMA Project", "This is a test CATMA project");
        // we don't add the projectId to projectsToDeleteOnTearDown as deletion of the user will take care of that for us
        // the JGitRepoManager instance should always be in a detached state after GitProjectManager calls return
        assertFalse(jGitRepoManager.isAttached());
        GitSourceDocumentHandler gitSourceDocumentHandler = new GitSourceDocumentHandler(jGitRepoManager, gitlabManagerRestricted, new UsernamePasswordCredentialsProvider("oauth2", gitlabManagerRestricted.getPassword()));
        String revisionHash = gitSourceDocumentHandler.create(projectId, sourceDocumentUuid, originalSourceDocumentStream, originalSourceDocument.getName(), convertedSourceDocumentStream, convertedSourceDocument.getName(), terms, tokenizedSourceDocumentFileName, sourceDocumentInfo);
        assertNotNull(revisionHash);
        // the JGitRepoManager instance should always be in a detached state after GitSourceDocumentHandler calls return
        assertFalse(jGitRepoManager.isAttached());
        File expectedRepoPath = Paths.get(jGitRepoManager.getRepositoryBasePath().getPath(), projectId, sourceDocumentUuid).toFile();
        // NOTE(review): the plain `assert` statements below are only evaluated when the JVM runs with
        // assertions enabled (-ea); consider switching to the test framework's assertTrue.
        assert expectedRepoPath.exists();
        assert expectedRepoPath.isDirectory();
        assert Arrays.asList(expectedRepoPath.list()).contains("rose_for_emily.pdf");
        assert Arrays.asList(expectedRepoPath.list()).contains("rose_for_emily.txt");
        assert FileUtils.contentEquals(originalSourceDocument, new File(expectedRepoPath, "rose_for_emily.pdf"));
        assert FileUtils.contentEquals(convertedSourceDocument, new File(expectedRepoPath, "rose_for_emily.txt"));
        assert Arrays.asList(expectedRepoPath.list()).contains("header.json");
        String expectedSerializedSourceDocumentInfo = "" + "{\n" + "  \"gitContentInfoSet\": {\n" + "    \"author\": \"William Faulkner\",\n" + "    \"description\": \"\",\n" + "    \"publisher\": \"\",\n" + "    \"title\": \"A Rose for Emily\"\n" + "  },\n" + "  \"gitIndexInfoSet\": {\n" + "    \"locale\": \"en\",\n" + "    \"unseparableCharacterSequences\": [],\n" + "    \"userDefinedSeparatingCharacters\": []\n" + "  },\n" + "  \"gitTechInfoSet\": {\n" + "    \"charset\": \"UTF-8\",\n" + "    \"checksum\": 705211438,\n" + "    \"fileName\": null,\n" + "    \"fileOSType\": \"DOS\",\n" + "    \"fileType\": \"TEXT\",\n" + "    \"mimeType\": \"text/plain\",\n" + "    \"uri\": null\n" + "  }\n" + "}";
        assertEquals(expectedSerializedSourceDocumentInfo, FileUtils.readFileToString(new File(expectedRepoPath, "header.json"), StandardCharsets.UTF_8));
    }
}
Also used : UsernamePasswordCredentialsProvider(org.eclipse.jgit.transport.UsernamePasswordCredentialsProvider) BackgroundService(de.catma.backgroundservice.BackgroundService) SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) ILocalGitRepositoryManager(de.catma.repository.git.interfaces.ILocalGitRepositoryManager) JGitRepoManager(de.catma.repository.git.managers.JGitRepoManager) TermExtractor(de.catma.indexer.TermExtractor) EventBus(com.google.common.eventbus.EventBus) FileInputStream(java.io.FileInputStream) ContentInfoSet(de.catma.document.source.ContentInfoSet) IndexInfoSet(de.catma.document.source.IndexInfoSet) TechInfoSet(de.catma.document.source.TechInfoSet) File(java.io.File) IDGenerator(de.catma.util.IDGenerator) GitLabServerManagerTest(de.catma.repository.git.managers.GitLabServerManagerTest) Test(org.junit.jupiter.api.Test)

Example 12 with ContentInfoSet

use of de.catma.document.source.ContentInfoSet in project catma by forTEXT.

the class JsonLdWebAnnotationTest method getJsonLdWebAnnotation.

/**
 * Builds a complete test fixture (project, tagset, source document, markup collection, tag definition
 * and tag instance) and wraps two tag references in a {@code JsonLdWebAnnotation}.
 * <p>
 * The supplied {@code jGitRepoManager} is managed by a try-with-resources statement in this method and
 * is therefore closed when the method returns or throws.
 *
 * @param jGitRepoManager the repository manager used for all local Git operations
 * @param gitLabServerManager the remote server manager, also the source of the push credentials
 * @param catmaUser the user on whose behalf the project is created
 * @return a {@code HashMap<String, Object>} with these keys:
 *         'jsonLdWebAnnotation' - for the JsonLdWebAnnotation object
 *         'projectUuid'
 *         --- following additional keys which are to be used when formatting EXPECTED_SERIALIZED_ANNOTATION ---:
 *         projectRootRepositoryName, tagsetDefinitionUuid, tagDefinitionUuid,
 *         userMarkupCollectionUuid, tagInstanceUuid, sourceDocumentUuid
 *         (NOTE(review): userPropertyDefinitionUuid and systemPropertyDefinitionUuid are NOT populated
 *         by this method)
 * @throws Exception if any of the file, Git or project operations fail
 */
public static HashMap<String, Object> getJsonLdWebAnnotation(JGitRepoManager jGitRepoManager, IRemoteGitServerManager gitLabServerManager, de.catma.user.User catmaUser) throws Exception {
    try (JGitRepoManager localJGitRepoManager = jGitRepoManager) {
        // caller should do the following:
        // this.directoriesToDeleteOnTearDown.add(localJGitRepoManager.getRepositoryBasePath());
        // create project
        GitProjectManager gitProjectManager = new GitProjectManager(RepositoryPropertyKey.GitBasedRepositoryBasePath.getValue(), UserIdentification.userToMap(catmaUser.getIdentifier()));
        String projectId = gitProjectManager.create("Test CATMA Project", "This is a test CATMA project");
        // caller should do the following:
        // this.projectsToDeleteOnTearDown.add(projectId);
        GitProjectHandler gitProjectHandler = new GitProjectHandler(null, projectId, jGitRepoManager, gitLabServerManager);
        // add new tagset to project
        String tagsetId = gitProjectHandler.createTagset(null, "Test Tagset", null);
        // add new source document to project
        File originalSourceDocument = new File("testdocs/rose_for_emily.pdf");
        File convertedSourceDocument = new File("testdocs/rose_for_emily.txt");
        IndexInfoSet indexInfoSet = new IndexInfoSet();
        indexInfoSet.setLocale(Locale.ENGLISH);
        ContentInfoSet contentInfoSet = new ContentInfoSet("William Faulkner", "", "", "A Rose for Emily");
        TechInfoSet techInfoSet = new TechInfoSet(FileType.TEXT, StandardCharsets.UTF_8, FileOSType.DOS, 705211438L);
        SourceDocumentInfo sourceDocumentInfo = new SourceDocumentInfo(indexInfoSet, contentInfoSet, techInfoSet);
        String sourceDocumentId;
        // the file streams are only needed for this one call; close them as soon as it returns or throws
        try (FileInputStream originalSourceDocumentStream = new FileInputStream(originalSourceDocument);
                FileInputStream convertedSourceDocumentStream = new FileInputStream(convertedSourceDocument)) {
            sourceDocumentId = gitProjectHandler.createSourceDocument(null, originalSourceDocumentStream, originalSourceDocument.getName(), convertedSourceDocumentStream, convertedSourceDocument.getName(), null, null, sourceDocumentInfo);
        }
        // add new markup collection to project
        String markupCollectionId = gitProjectHandler.createMarkupCollection(null, "Test Markup Collection", null, sourceDocumentId, "fakeSourceDocumentVersion");
        // commit the changes to the project root repo (addition of tagset, source document and markup collection
        // submodules)
        String projectRootRepositoryName = GitProjectManager.getProjectRootRepositoryName(projectId);
        localJGitRepoManager.open(projectId, projectRootRepositoryName);
        localJGitRepoManager.commit(String.format("Adding new tagset %s, source document %s and markup collection %s", tagsetId, sourceDocumentId, markupCollectionId), "Test Committer", "testcommitter@catma.de");
        // can't call open on an attached instance
        localJGitRepoManager.detach();
        // construct TagDefinition object
        IDGenerator idGenerator = new IDGenerator();
        List<String> systemPropertyPossibleValues = Arrays.asList("SYSPROP_VAL_1", "SYSPROP_VAL_2");
        PropertyDefinition systemPropertyDefinition = new PropertyDefinition(PropertyDefinition.SystemPropertyName.catma_displaycolor.toString(), systemPropertyPossibleValues);
        List<String> userPropertyPossibleValues = Arrays.asList("UPROP_VAL_1", "UPROP_VAL_2");
        PropertyDefinition userPropertyDefinition = new PropertyDefinition("UPROP_DEF", userPropertyPossibleValues);
        String tagDefinitionUuid = idGenerator.generate();
        TagDefinition tagDefinition = new TagDefinition(null, tagDefinitionUuid, "TAG_DEF", new Version(), null, null, tagsetId);
        tagDefinition.addSystemPropertyDefinition(systemPropertyDefinition);
        tagDefinition.addUserDefinedPropertyDefinition(userPropertyDefinition);
        // call createTagDefinition
        // NB: in this case we know that the tagset submodule is on the master branch tip, ie: not in a detached
        // head state, so it's safe to make changes to the submodule and commit them
        // TODO: createTagDefinition should probably do some validation and fail fast if the tagset submodule is in
        // a detached head state - in that case the submodule would need to be updated first
        // see the "Updating a submodule in-place in the container" scenario at
        // https://medium.com/@porteneuve/mastering-git-submodules-34c65e940407
        GitTagsetHandler gitTagsetHandler = new GitTagsetHandler(localJGitRepoManager, gitLabServerManager);
        String returnedTagDefinitionId = gitTagsetHandler.createOrUpdateTagDefinition(projectId, tagsetId, tagDefinition);
        assertNotNull(returnedTagDefinitionId);
        assert returnedTagDefinitionId.startsWith("CATMA_");
        // the JGitRepoManager instance should always be in a detached state after GitTagsetHandler calls return
        assertFalse(localJGitRepoManager.isAttached());
        assertEquals(tagDefinitionUuid, returnedTagDefinitionId);
        // commit and push submodule changes (creation of tag definition)
        // TODO: add methods to JGitRepoManager to do this
        localJGitRepoManager.open(projectId, projectRootRepositoryName);
        Repository projectRootRepository = localJGitRepoManager.getGitApi().getRepository();
        String tagsetSubmodulePath = String.format("%s/%s", GitProjectHandler.TAGSET_SUBMODULES_DIRECTORY_NAME, tagsetId);
        Repository tagsetSubmoduleRepository = SubmoduleWalk.getSubmoduleRepository(projectRootRepository, tagsetSubmodulePath);
        Git submoduleGit = new Git(tagsetSubmoduleRepository);
        try {
            submoduleGit.add().addFilepattern(tagDefinitionUuid).call();
            submoduleGit.commit().setMessage(String.format("Adding tag definition %s", tagDefinitionUuid)).setCommitter("Test Committer", "testcommitter@catma.de").call();
            submoduleGit.push().setCredentialsProvider(new UsernamePasswordCredentialsProvider(gitLabServerManager.getUsername(), gitLabServerManager.getPassword())).call();
        } finally {
            // release the submodule handles even when add/commit/push fails
            tagsetSubmoduleRepository.close();
            submoduleGit.close();
        }
        // commit and push project root repo changes (update of tagset submodule)
        localJGitRepoManager.getGitApi().add().addFilepattern(tagsetSubmodulePath).call();
        localJGitRepoManager.commit(String.format("Updating tagset %s", tagsetId), "Test Committer", "testcommitter@catma.de");
        // construct TagInstance object
        Property systemProperty = new Property(systemPropertyDefinition, Collections.singleton("SYSPROP_VAL_1"));
        Property userProperty = new Property(userPropertyDefinition, Collections.singleton("UPROP_VAL_2"));
        String tagInstanceUuid = idGenerator.generate();
        TagInstance tagInstance = new TagInstance(tagInstanceUuid, tagDefinition);
        tagInstance.addSystemProperty(systemProperty);
        tagInstance.addUserDefinedProperty(userProperty);
        // construct JsonLdWebAnnotation object
        String sourceDocumentUri = String.format("http://catma.de/gitlab/%s/%s/%s", projectRootRepositoryName, GitProjectHandler.SOURCE_DOCUMENT_SUBMODULES_DIRECTORY_NAME, sourceDocumentId);
        Range range1 = new Range(12, 18);
        Range range2 = new Range(41, 47);
        List<TagReference> tagReferences = new ArrayList<>(Arrays.asList(new TagReference(tagInstance, sourceDocumentUri, range1, markupCollectionId), new TagReference(tagInstance, sourceDocumentUri, range2, markupCollectionId)));
        JsonLdWebAnnotation jsonLdWebAnnotation = new JsonLdWebAnnotation("http://catma.de/gitlab", projectId, tagReferences);
        HashMap<String, Object> returnValue = new HashMap<>();
        returnValue.put("jsonLdWebAnnotation", jsonLdWebAnnotation);
        returnValue.put("projectRootRepositoryName", projectRootRepositoryName);
        returnValue.put("projectUuid", projectId);
        returnValue.put("tagsetDefinitionUuid", tagsetId);
        returnValue.put("tagDefinitionUuid", tagDefinitionUuid);
        returnValue.put("userMarkupCollectionUuid", markupCollectionId);
        returnValue.put("tagInstanceUuid", tagInstanceUuid);
        returnValue.put("sourceDocumentUuid", sourceDocumentId);
        return returnValue;
    }
}
Also used : TagDefinition(de.catma.tag.TagDefinition) SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) GitProjectManager(de.catma.repository.git.GitProjectManager) ContentInfoSet(de.catma.document.source.ContentInfoSet) IndexInfoSet(de.catma.document.source.IndexInfoSet) Version(de.catma.tag.Version) TechInfoSet(de.catma.document.source.TechInfoSet) Property(de.catma.tag.Property) GitTagsetHandler(de.catma.repository.git.GitTagsetHandler) UsernamePasswordCredentialsProvider(org.eclipse.jgit.transport.UsernamePasswordCredentialsProvider) JGitRepoManager(de.catma.repository.git.managers.JGitRepoManager) Range(de.catma.document.Range) PropertyDefinition(de.catma.tag.PropertyDefinition) FileInputStream(java.io.FileInputStream) Repository(org.eclipse.jgit.lib.Repository) Git(org.eclipse.jgit.api.Git) TagInstance(de.catma.tag.TagInstance) GitProjectHandler(de.catma.repository.git.GitProjectHandler) TagReference(de.catma.document.standoffmarkup.usermarkup.TagReference) File(java.io.File) IDGenerator(de.catma.util.IDGenerator)

Example 13 with ContentInfoSet

use of de.catma.document.source.ContentInfoSet in project catma by forTEXT.

the class CorpusImporter method importCorpus.

/**
 * !BACKGROUND THREAD! No direct UI code here!
 * <p>
 * Imports a gzipped tar corpus archive into the given project. The archive is read entry by entry:
 * entries whose fourth path segment is {@code annotationcollections} are loaded as annotation
 * collections for an already imported document, every other entry is treated as a source document,
 * copied to a temporary file and inserted into the project. All project modifications are executed
 * through {@code ui.accessSynchronously}.
 * <p>
 * Failures while importing a single collection or document are logged and reported via a notification;
 * the import then continues with the next archive entry.
 *
 * @param progressListener receives progress messages for each imported document and collection
 * @param corpusFile the gzipped tar archive to import
 * @param documentMetadataList metadata for the documents of the corpus, matched by source document ID
 * @param tempDir directory in which the temporary per-document files are created
 * @param ui the UI whose session is used to run the project modifications
 * @param project the target project
 * @return always {@code null}
 * @throws IllegalStateException if the archive contains a document without an entry in {@code documentMetadataList}
 * @throws Exception if the archive cannot be read or a temporary file cannot be written
 */
public Void importCorpus(final ProgressListener progressListener, final File corpusFile, final List<CorpusImportDocumentMetadata> documentMetadataList, final String tempDir, final UI ui, final Project project) throws Exception {
    progressListener.setProgress("Importing Corpus");
    // every layer is its own resource so that the file handle is released even if the GZIP header is invalid
    try (FileInputStream corpusFileIs = new FileInputStream(corpusFile);
            GZIPInputStream gzipIs = new GZIPInputStream(corpusFileIs);
            TarArchiveInputStream taIs = new TarArchiveInputStream(gzipIs)) {
        TarArchiveEntry entry = taIs.getNextTarEntry();
        while (entry != null) {
            final String entryName = entry.getName();
            // NOTE(review): assumes every entry name has at least four '/'-separated segments - TODO confirm
            // that the archive never contains shorter entries (e.g. plain directory entries)
            final String[] pathParts = entryName.split(Pattern.quote("/"));
            final String documentIdPart = pathParts[2];
            // NOTE(review): skips the "__" separator plus one additional character - presumably dictated by
            // the export format; confirm the offset of 3
            final String documentId = documentIdPart.substring(documentIdPart.indexOf("__") + 3);
            final String idUri = "catma://" + documentId;
            if (pathParts[3].equals("annotationcollections")) {
                progressListener.setProgress("Importing Collection %1$s", pathParts[4]);
                ui.accessSynchronously(() -> {
                    try {
                        // buffer the current tar entry so that it can be handed over as a self-contained stream
                        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                        IOUtils.copy(taIs, buffer);
                        SourceDocument document = project.getSourceDocument(documentId);
                        Pair<AnnotationCollection, List<TagsetDefinitionImportStatus>> loadResult = project.loadAnnotationCollection(new ByteArrayInputStream(buffer.toByteArray()), document);
                        List<TagsetDefinitionImportStatus> tagsetDefinitionImportStatusList = loadResult.getSecond();
                        final AnnotationCollection annotationCollection = loadResult.getFirst();
                        // the "Intrinsic Markup" tagset and its annotations are dropped from the collection
                        Optional<TagsetDefinition> optIntrinsicTagset = annotationCollection.getTagLibrary().getTagsetDefinitions().stream().filter(tagsetDef -> tagsetDef.getName().equals("Intrinsic Markup")).findFirst();
                        if (optIntrinsicTagset.isPresent()) {
                            TagsetDefinition intrinsicTagset = optIntrinsicTagset.get();
                            List<TagReference> intrinsicAnnotations = annotationCollection.getTagReferences(intrinsicTagset);
                            if (!intrinsicAnnotations.isEmpty()) {
                                annotationCollection.removeTagReferences(intrinsicAnnotations);
                            }
                            annotationCollection.getTagLibrary().remove(intrinsicTagset);
                            tagsetDefinitionImportStatusList.stream().filter(status -> status.getTagset().equals(intrinsicTagset)).findFirst().ifPresent(status -> status.setDoImport(false));
                        }
                        // empty tagsets are not imported
                        tagsetDefinitionImportStatusList.stream().filter(status -> status.getTagset().isEmpty()).forEach(status -> status.setDoImport(false));
                        if (!annotationCollection.isEmpty()) {
                            project.importCollection(tagsetDefinitionImportStatusList, annotationCollection);
                        }
                    } catch (Exception e) {
                        Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, "Error importing the CATMA 5 Corpus: " + entryName, e);
                        String errorMsg = e.getMessage();
                        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                            errorMsg = "";
                        }
                        Notification.show("Error", String.format("Error importing the CATMA 5 Corpus! " + "This Collection will be skipped!\n The underlying error message was:\n%1$s", errorMsg), Type.ERROR_MESSAGE);
                    }
                });
            } else {
                final CorpusImportDocumentMetadata documentMetadata = documentMetadataList.stream().filter(metadata -> metadata.getSourceDocID().equals(idUri)).findFirst().orElse(null);
                if (documentMetadata == null) {
                    // fail with a descriptive error instead of a bare NullPointerException on the next line
                    throw new IllegalStateException(String.format("No import metadata found for document %1$s (archive entry %2$s)", idUri, entryName));
                }
                final Locale locale = LocaleUtils.toLocale(documentMetadata.getSourceDocLocale());
                final boolean useApostrophe = Arrays.asList(documentMetadata.getSourceDocSepChars()).contains(String.valueOf(UploadFile.APOSTROPHE));
                final String title = (documentMetadata.getSourceDocName() == null || documentMetadata.getSourceDocName().isEmpty()) ? documentId : documentMetadata.getSourceDocName();
                progressListener.setProgress("Importing Document %1$s", title);
                // copy the current tar entry to a temporary file named after the document ID
                final File tempFile = new File(new File(tempDir), documentId);
                if (tempFile.exists()) {
                    tempFile.delete();
                }
                try (FileOutputStream fos = new FileOutputStream(tempFile)) {
                    IOUtils.copy(taIs, fos);
                }
                ui.accessSynchronously(() -> {
                    IDGenerator idGenerator = new IDGenerator();
                    IndexInfoSet indexInfoSet = new IndexInfoSet(Collections.emptyList(), useApostrophe ? Lists.newArrayList(UploadFile.APOSTROPHE) : Collections.emptyList(), locale);
                    TechInfoSet techInfoSet = new TechInfoSet(documentId, FileType.TEXT.getMimeType(), tempFile.toURI());
                    ContentInfoSet contentInfoSet = new ContentInfoSet(documentMetadata.getSourceDocAuthor(), documentMetadata.getSourceDocDescription(), documentMetadata.getSourceDocPublisher(), title);
                    techInfoSet.setCharset(Charset.forName("UTF-8"));
                    SourceDocumentInfo documentInfo = new SourceDocumentInfo(indexInfoSet, contentInfoSet, techInfoSet);
                    // the content handler is chosen by the suffix of the archive entry name
                    AbstractSourceContentHandler handler = null;
                    boolean loadIntrinsicMarkup = false;
                    if (entryName.endsWith("xml2")) {
                        handler = new XML2ContentHandler();
                        loadIntrinsicMarkup = true;
                    } else if (entryName.endsWith("xml")) {
                        handler = new OldXMLContentHandler();
                        loadIntrinsicMarkup = true;
                    } else {
                        handler = new StandardContentHandler();
                    }
                    handler.setSourceDocumentInfo(documentInfo);
                    SourceDocument document = new SourceDocument(documentId, handler);
                    try {
                        project.insert(document, false);
                        if (loadIntrinsicMarkup) {
                            final TagManager tagmanager = new TagManager(new TagLibrary());
                            // NOTE(review): the cast presumes OldXMLContentHandler is a subtype of
                            // XML2ContentHandler - confirm, otherwise "xml" entries fail here
                            XmlMarkupCollectionSerializationHandler markupHandler = new XmlMarkupCollectionSerializationHandler(tagmanager, (XML2ContentHandler) handler, project.getUser().getIdentifier());
                            try (FileInputStream fis = new FileInputStream(tempFile)) {
                                AnnotationCollection intrinsicMarkupCollection = markupHandler.deserialize(document, idGenerator.generateCollectionId(), fis);
                                Collection<TagsetImport> tagsetImports = new ArrayList<TagsetImport>();
                                String defaultIntrinsicXMLElmentsName = "Default Intrinsic XML Elements";
                                // classify every non-empty extracted tagset as "create" or "merge"
                                for (TagsetDefinition tagset : tagmanager.getTagLibrary()) {
                                    if (!tagset.isEmpty()) {
                                        TagsetDefinition targetTagset = project.getTagManager().getTagLibrary().getTagsetDefinition(tagset.getUuid());
                                        boolean inProject = false;
                                        if (targetTagset == null) {
                                            targetTagset = tagset;
                                        } else {
                                            inProject = true;
                                        }
                                        String namespace = tagset.getName() == null ? "none" : tagset.getName();
                                        if (tagset.getName() == null) {
                                            tagset.setName(defaultIntrinsicXMLElmentsName);
                                        }
                                        TagsetImport tagsetImport = new TagsetImport(namespace, tagset, targetTagset, inProject ? TagsetImportState.WILL_BE_MERGED : TagsetImportState.WILL_BE_CREATED);
                                        tagsetImports.add(tagsetImport);
                                    }
                                }
                                // Creating Tagsets
                                tagsetImports.stream().filter(ti -> ti.getImportState().equals(TagsetImportState.WILL_BE_CREATED)).forEach(tagsetImport -> {
                                    if (project.getTagManager().getTagLibrary().getTagsetDefinition(tagsetImport.getTargetTagset().getUuid()) != null) {
                                        // already imported, so it will be a merge
                                        tagsetImport.setImportState(TagsetImportState.WILL_BE_MERGED);
                                    } else {
                                        TagsetDefinition extractedTagset = tagsetImport.getExtractedTagset();
                                        try {
                                            project.importTagsets(Collections.singletonList(new TagsetDefinitionImportStatus(extractedTagset, project.inProjectHistory(extractedTagset.getUuid()), project.getTagManager().getTagLibrary().getTagsetDefinition(extractedTagset.getUuid()) != null)));
                                        } catch (Exception e) {
                                            Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, String.format("Error importing tagset %1$s with ID %2$s", extractedTagset.getName(), extractedTagset.getUuid()), e);
                                            String errorMsg = e.getMessage();
                                            if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                                                errorMsg = "";
                                            }
                                            Notification.show("Error", String.format("Error importing tagset %1$s! " + "This tagset will be skipped!\n The underlying error message was:\n%2$s", extractedTagset.getName(), errorMsg), Type.ERROR_MESSAGE);
                                        }
                                    }
                                });
                                // Merging Tagsets
                                tagsetImports.stream().filter(ti -> ti.getImportState().equals(TagsetImportState.WILL_BE_MERGED)).forEach(tagsetImport -> {
                                    TagsetDefinition targetTagset = project.getTagManager().getTagLibrary().getTagsetDefinition(tagsetImport.getTargetTagset().getUuid());
                                    for (TagDefinition tag : tagsetImport.getExtractedTagset()) {
                                        Optional<TagDefinition> optionalTag = targetTagset.getTagDefinitionsByName(tag.getName()).findFirst();
                                        if (optionalTag.isPresent()) {
                                            TagDefinition existingTag = optionalTag.get();
                                            // add property definitions the existing tag does not have yet
                                            tag.getUserDefinedPropertyDefinitions().forEach(pd -> {
                                                if (existingTag.getPropertyDefinition(pd.getName()) == null) {
                                                    project.getTagManager().addUserDefinedPropertyDefinition(existingTag, new PropertyDefinition(pd));
                                                }
                                            });
                                            // re-create the annotations of the incoming tag as annotations of the existing tag
                                            List<TagReference> tagReferences = intrinsicMarkupCollection.getTagReferences(tag);
                                            intrinsicMarkupCollection.removeTagReferences(tagReferences);
                                            Multimap<TagInstance, TagReference> referencesByInstance = ArrayListMultimap.create();
                                            tagReferences.forEach(tr -> referencesByInstance.put(tr.getTagInstance(), tr));
                                            for (TagInstance incomingTagInstance : referencesByInstance.keySet()) {
                                                TagInstance newTagInstance = new TagInstance(idGenerator.generate(), existingTag.getUuid(), incomingTagInstance.getAuthor(), incomingTagInstance.getTimestamp(), existingTag.getUserDefinedPropertyDefinitions(), targetTagset.getUuid());
                                                for (Property oldProp : incomingTagInstance.getUserDefinedProperties()) {
                                                    String oldPropDefId = oldProp.getPropertyDefinitionId();
                                                    PropertyDefinition oldPropDef = tag.getPropertyDefinitionByUuid(oldPropDefId);
                                                    PropertyDefinition existingPropDef = existingTag.getPropertyDefinition(oldPropDef.getName());
                                                    newTagInstance.addUserDefinedProperty(new Property(existingPropDef.getUuid(), oldProp.getPropertyValueList()));
                                                }
                                                ArrayList<TagReference> newReferences = new ArrayList<>();
                                                referencesByInstance.get(incomingTagInstance).forEach(tr -> {
                                                    try {
                                                        newReferences.add(new TagReference(newTagInstance, tr.getTarget().toString(), tr.getRange(), tr.getUserMarkupCollectionUuid()));
                                                    } catch (URISyntaxException e) {
                                                        // the reference is dropped; record the failure through the logger like
                                                        // every other error in this import instead of printing to stderr
                                                        Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, "Error re-creating an annotation reference while importing: " + entryName, e);
                                                    }
                                                });
                                                intrinsicMarkupCollection.addTagReferences(newReferences);
                                            }
                                        } else {
                                            tag.setTagsetDefinitionUuid(targetTagset.getUuid());
                                            project.getTagManager().addTagDefinition(targetTagset, tag);
                                        }
                                    }
                                });
                                project.importCollection(Collections.emptyList(), intrinsicMarkupCollection);
                            }
                            if (tempFile.exists()) {
                                tempFile.delete();
                            }
                        }
                    } catch (Exception e) {
                        Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, "Error importing the CATMA 5 Corpus: " + entryName, e);
                        String errorMsg = e.getMessage();
                        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                            errorMsg = "";
                        }
                        Notification.show("Error", String.format("Error importing the CATMA 5 Corpus! " + "This Document will be skipped!\n The underlying error message was:\n%1$s", errorMsg), Type.ERROR_MESSAGE);
                    }
                });
            }
            entry = taIs.getNextTarEntry();
        }
    }
    return null;
}
Also used : ArrayListMultimap(com.google.common.collect.ArrayListMultimap) AbstractSourceContentHandler(de.catma.document.source.contenthandler.AbstractSourceContentHandler) Arrays(java.util.Arrays) GZIPInputStream(java.util.zip.GZIPInputStream) URISyntaxException(java.net.URISyntaxException) UI(com.vaadin.ui.UI) XmlMarkupCollectionSerializationHandler(de.catma.serialization.intrinsic.xml.XmlMarkupCollectionSerializationHandler) TechInfoSet(de.catma.document.source.TechInfoSet) OldXMLContentHandler(de.catma.document.source.contenthandler.OldXMLContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Locale(java.util.Locale) TagsetImport(de.catma.ui.module.project.documentwizard.TagsetImport) Collection(java.util.Collection) IndexInfoSet(de.catma.document.source.IndexInfoSet) ProjectView(de.catma.ui.module.project.ProjectView) TagInstance(de.catma.tag.TagInstance) Logger(java.util.logging.Logger) SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) TagReference(de.catma.document.annotation.TagReference) List(java.util.List) Type(com.vaadin.ui.Notification.Type) TagDefinition(de.catma.tag.TagDefinition) Optional(java.util.Optional) FileType(de.catma.document.source.FileType) Pattern(java.util.regex.Pattern) ContentInfoSet(de.catma.document.source.ContentInfoSet) PropertyDefinition(de.catma.tag.PropertyDefinition) ByteArrayOutputStream(java.io.ByteArrayOutputStream) TagManager(de.catma.tag.TagManager) UploadFile(de.catma.ui.module.project.documentwizard.UploadFile) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) Multimap(com.google.common.collect.Multimap) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) LocaleUtils(org.apache.commons.lang3.LocaleUtils) Lists(com.google.common.collect.Lists) Charset(java.nio.charset.Charset) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry) Notification(com.vaadin.ui.Notification) 
StandardContentHandler(de.catma.document.source.contenthandler.StandardContentHandler) TagsetDefinition(de.catma.tag.TagsetDefinition) Pair(de.catma.util.Pair) IDGenerator(de.catma.util.IDGenerator) TagLibrary(de.catma.tag.TagLibrary) ProgressListener(de.catma.backgroundservice.ProgressListener) XML2ContentHandler(de.catma.document.source.contenthandler.XML2ContentHandler) Property(de.catma.tag.Property) Project(de.catma.project.Project) FileOutputStream(java.io.FileOutputStream) IOUtils(org.apache.commons.compress.utils.IOUtils) FileInputStream(java.io.FileInputStream) SourceDocument(de.catma.document.source.SourceDocument) AnnotationCollection(de.catma.document.annotation.AnnotationCollection) File(java.io.File) TagsetDefinitionImportStatus(de.catma.serialization.TagsetDefinitionImportStatus) TagsetImportState(de.catma.ui.module.project.documentwizard.TagsetImportState) Collections(java.util.Collections) Locale(java.util.Locale) SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) ArrayList(java.util.ArrayList) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) IndexInfoSet(de.catma.document.source.IndexInfoSet) List(java.util.List) ArrayList(java.util.ArrayList) TechInfoSet(de.catma.document.source.TechInfoSet) TagLibrary(de.catma.tag.TagLibrary) AnnotationCollection(de.catma.document.annotation.AnnotationCollection) SourceDocument(de.catma.document.source.SourceDocument) XML2ContentHandler(de.catma.document.source.contenthandler.XML2ContentHandler) FileInputStream(java.io.FileInputStream) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry) TagsetDefinition(de.catma.tag.TagsetDefinition) OldXMLContentHandler(de.catma.document.source.contenthandler.OldXMLContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) FileOutputStream(java.io.FileOutputStream) TagReference(de.catma.document.annotation.TagReference) 
StandardContentHandler(de.catma.document.source.contenthandler.StandardContentHandler) UploadFile(de.catma.ui.module.project.documentwizard.UploadFile) File(java.io.File) IDGenerator(de.catma.util.IDGenerator) TagsetImport(de.catma.ui.module.project.documentwizard.TagsetImport) TagDefinition(de.catma.tag.TagDefinition) URISyntaxException(java.net.URISyntaxException) GZIPInputStream(java.util.zip.GZIPInputStream) ContentInfoSet(de.catma.document.source.ContentInfoSet) ProjectView(de.catma.ui.module.project.ProjectView) Property(de.catma.tag.Property) AbstractSourceContentHandler(de.catma.document.source.contenthandler.AbstractSourceContentHandler) ByteArrayOutputStream(java.io.ByteArrayOutputStream) PropertyDefinition(de.catma.tag.PropertyDefinition) URISyntaxException(java.net.URISyntaxException) TagManager(de.catma.tag.TagManager) XmlMarkupCollectionSerializationHandler(de.catma.serialization.intrinsic.xml.XmlMarkupCollectionSerializationHandler) TagInstance(de.catma.tag.TagInstance) TagsetDefinitionImportStatus(de.catma.serialization.TagsetDefinitionImportStatus)

Example 14 with ContentInfoSet

use of de.catma.document.source.ContentInfoSet in project catma by forTEXT.

the class TeiSourceDocumentInfoSerializationHandler method deserialize.

/**
 * Builds a {@link SourceDocumentInfo} from the info sets exposed by a TEI document.
 *
 * @param teiDocument the TEI document whose content, tech and index info sets are read
 * @return a new {@link SourceDocumentInfo} holding the document's index, content and tech info sets
 */
private SourceDocumentInfo deserialize(TeiDocument teiDocument) {
    // The info sets are read in the order content, tech, index.
    final ContentInfoSet content = teiDocument.getContentInfoSet();
    final TechInfoSet tech = teiDocument.getTechInfoset();
    final IndexInfoSet index = teiDocument.getIndexInfoSet();

    // Note the constructor's argument order differs from the read order above.
    final SourceDocumentInfo info = new SourceDocumentInfo(index, content, tech);
    return info;
}
Also used : ContentInfoSet(de.catma.document.source.ContentInfoSet) SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) IndexInfoSet(de.catma.document.source.IndexInfoSet) TechInfoSet(de.catma.document.source.TechInfoSet)

Example 15 with ContentInfoSet

use of de.catma.document.source.ContentInfoSet in project catma by forTEXT.

the class XmlMarkupCollectionSerializationHandler method deserialize.

/**
 * Parses intrinsic XML markup from the given stream into a new {@link AnnotationCollection}.
 *
 * <p>Steps performed:
 * <ol>
 *   <li>The stream is parsed into a XOM {@link Document}.</li>
 *   <li>For every namespace declared on the root element that has a non-empty URI, a tagset ID
 *       is generated from that URI; if the tag library does not yet contain a tagset with that
 *       ID, a new {@link TagsetDefinition} is registered with the tag manager. The prefix is
 *       mapped to the tagset ID.</li>
 *   <li>The default intrinsic XML tagset is registered if the tag library does not contain it.</li>
 *   <li>The element tree is handed to {@code scanElements} together with the freshly created
 *       collection.</li>
 * </ol>
 *
 * @param sourceDocument the document the markup belongs to; its UUID, revision hash and length are used
 * @param id the ID assigned to the resulting collection
 * @param inputStream the stream providing the XML to parse
 * @return the newly created annotation collection
 * @throws IOException wrapping any exception raised while parsing or scanning
 */
@Override
public AnnotationCollection deserialize(SourceDocument sourceDocument, String id, InputStream inputStream) throws IOException {
    try {
        Document xmlDocument = new Builder().build(inputStream);

        // Maps each declared namespace prefix to the ID of the tagset derived from its URI.
        Map<String, String> tagsetIdsByPrefix = new HashMap<>();
        for (int declIdx = 0; declIdx < xmlDocument.getRootElement().getNamespaceDeclarationCount(); declIdx++) {
            String nsPrefix = xmlDocument.getRootElement().getNamespacePrefix(declIdx);
            String nsUri = xmlDocument.getRootElement().getNamespaceURI(nsPrefix);

            // Declarations without a usable URI contribute no tagset.
            if (nsUri == null || nsUri.isEmpty()) {
                continue;
            }

            String nsTagsetId = idGenerator.generateTagsetId(nsUri);
            boolean tagsetMissing = tagManager.getTagLibrary().getTagsetDefinition(nsTagsetId) == null;
            if (tagsetMissing) {
                tagManager.addTagsetDefinition(new TagsetDefinition(nsTagsetId, nsUri, new Version()));
            }
            tagsetIdsByPrefix.put(nsPrefix, nsTagsetId);
        }

        // Make sure the default intrinsic XML tagset is registered (it is created without a name).
        String intrinsicTagsetId = KnownTagsetDefinitionName.DEFAULT_INTRINSIC_XML.asTagsetId();
        if (tagManager.getTagLibrary().getTagsetDefinition(intrinsicTagsetId) == null) {
            tagManager.addTagsetDefinition(new TagsetDefinition(intrinsicTagsetId, null, new Version()));
        }

        StringBuilder textContent = new StringBuilder();
        Stack<String> openElements = new Stack<>();

        ContentInfoSet collectionInfo = new ContentInfoSet("", "Intrinsic Markup", "", DEFAULT_COLLECTION_TITLE);
        AnnotationCollection collection = new AnnotationCollection(
            id,
            collectionInfo,
            tagManager.getTagLibrary(),
            sourceDocument.getUuid(),
            sourceDocument.getRevisionHash()
        );

        scanElements(
            textContent,
            xmlDocument.getRootElement(),
            openElements,
            tagManager,
            tagManager.getTagLibrary(),
            tagsetIdsByPrefix,
            collection,
            sourceDocument.getUuid(),
            sourceDocument.getLength()
        );

        return collection;
    } catch (Exception e) {
        // Every failure, checked or unchecked, is reported to the caller as an IOException.
        throw new IOException(e);
    }
}
Also used : AnnotationCollection(de.catma.document.annotation.AnnotationCollection) HashMap(java.util.HashMap) Builder(nu.xom.Builder) IOException(java.io.IOException) SourceDocument(de.catma.document.source.SourceDocument) Document(nu.xom.Document) IOException(java.io.IOException) Stack(java.util.Stack) TagsetDefinition(de.catma.tag.TagsetDefinition) ContentInfoSet(de.catma.document.source.ContentInfoSet) Version(de.catma.tag.Version)

Aggregations

ContentInfoSet (de.catma.document.source.ContentInfoSet)17 File (java.io.File)12 ILocalGitRepositoryManager (de.catma.repository.git.interfaces.ILocalGitRepositoryManager)9 IndexInfoSet (de.catma.document.source.IndexInfoSet)6 SourceDocumentInfo (de.catma.document.source.SourceDocumentInfo)6 TechInfoSet (de.catma.document.source.TechInfoSet)6 IDGenerator (de.catma.util.IDGenerator)6 FileInputStream (java.io.FileInputStream)6 AnnotationCollection (de.catma.document.annotation.AnnotationCollection)4 JGitRepoManager (de.catma.repository.git.managers.JGitRepoManager)4 GitMarkupCollectionHeader (de.catma.repository.git.serialization.models.GitMarkupCollectionHeader)4 TagsetDefinition (de.catma.tag.TagsetDefinition)4 EventBus (com.google.common.eventbus.EventBus)3 BackgroundService (de.catma.backgroundservice.BackgroundService)3 SourceDocument (de.catma.document.source.SourceDocument)3 TermExtractor (de.catma.indexer.TermExtractor)3 GitLabServerManagerTest (de.catma.repository.git.managers.GitLabServerManagerTest)3 Property (de.catma.tag.Property)3 TagInstance (de.catma.tag.TagInstance)3 UsernamePasswordCredentialsProvider (org.eclipse.jgit.transport.UsernamePasswordCredentialsProvider)3